Setup and reproducibility¶

In [ ]:
# Box2D support for LunarLander and BipedalWalker
!pip -q install swig
!pip -q install "gymnasium[box2d]"
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 29.0 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 374.4/374.4 kB 8.3 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
  Building wheel for box2d-py (setup.py) ... done

Imports

In [ ]:
import os
import time
import random
from collections import deque

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

import gymnasium as gym
import matplotlib.pyplot as plt

from torch.distributions import Categorical, Normal

Reproducibility

In [ ]:
# Global seed
SEED = 1227
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device} (SEED={SEED})")
Using device: cpu (SEED=1227)

Artifact directories

In [ ]:
# Base directory for all PPO bonus artifacts
BONUS_ARTIFACTS_ROOT = "a3_bonus_ppo_artifacts"

# Environment-specific subfolders
LUNAR_ENV_NAME = "lunar_lander"
ACROBOT_ENV_NAME = "acrobot"
BIPEDAL_ENV_NAME = "bipedal_walker"

LUNAR_ROOT = os.path.join(BONUS_ARTIFACTS_ROOT, LUNAR_ENV_NAME)
ACROBOT_ROOT = os.path.join(BONUS_ARTIFACTS_ROOT, ACROBOT_ENV_NAME)
BIPEDAL_ROOT = os.path.join(BONUS_ARTIFACTS_ROOT, BIPEDAL_ENV_NAME)

# Create base folders up front
os.makedirs(LUNAR_ROOT, exist_ok=True)
os.makedirs(ACROBOT_ROOT, exist_ok=True)
os.makedirs(BIPEDAL_ROOT, exist_ok=True)

def make_run_dir(env_root: str, run_name: str) -> str:
    run_dir = os.path.join(env_root, run_name)
    os.makedirs(run_dir, exist_ok=True)

    # subfolder for videos inside each run
    video_dir = os.path.join(run_dir, "videos")
    os.makedirs(video_dir, exist_ok=True)

    return run_dir

print(f"Artifacts root: {BONUS_ARTIFACTS_ROOT}")
print(f"  LunarLander dir: {LUNAR_ROOT}")
print(f"  Acrobot dir:     {ACROBOT_ROOT}")
print(f"  Bipedal dir:     {BIPEDAL_ROOT}")
Artifacts root: a3_bonus_ppo_artifacts
  LunarLander dir: a3_bonus_ppo_artifacts/lunar_lander
  Acrobot dir:     a3_bonus_ppo_artifacts/acrobot
  Bipedal dir:     a3_bonus_ppo_artifacts/bipedal_walker

1. Environment factories and seeding¶

This section defines helper functions that create environments for training and evaluation. The same utilities are used for LunarLander, Acrobot, and BipedalWalker so that all experiments follow a consistent setup. Each environment receives a deterministic seed, which keeps runs reproducible while still allowing different workers to explore distinct trajectories if multiple processes are used. Evaluation environments use a fixed seed for stable comparisons across algorithms and runs.

In [ ]:
from gymnasium.wrappers import RecordVideo

# Environment IDs
LUNAR_ENV_ID = "LunarLander-v3"
ACROBOT_ENV_ID = "Acrobot-v1"
BIPEDAL_ENV_ID = "BipedalWalker-v3"

def worker_seed(worker_id: int, base_seed: int = SEED) -> int:
    return int(base_seed + 10_000 * worker_id)


def set_env_seed(env: gym.Env, seed: int):
    obs, info = env.reset(seed=seed)
    if hasattr(env.action_space, "seed"):
        env.action_space.seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    return obs, info


def make_env(env_id: str, worker_id: int = 0, base_seed: int = SEED, render_mode: str | None = None) -> gym.Env:
    env = gym.make(env_id, render_mode=render_mode)
    _ = set_env_seed(env, worker_seed(worker_id, base_seed))
    return env


def make_eval_env(env_id: str, base_seed: int = SEED, render_mode: str | None = None) -> gym.Env:
    env = gym.make(env_id, render_mode=render_mode)
    _ = set_env_seed(env, base_seed)
    return env


def make_video_env(env_id: str, video_dir: str, base_seed: int = SEED) -> gym.Env:
    os.makedirs(video_dir, exist_ok=True)
    env = gym.make(env_id, render_mode="rgb_array")
    env = RecordVideo(
        env,
        video_folder=video_dir,
        episode_trigger=lambda ep_id: True,  # record every episode used with this env
        name_prefix="ppo_bonus_eval",
    )
    _ = set_env_seed(env, base_seed)
    return env

2. Logging and plotting utilities¶

This section defines small utility functions for tracking episode returns and producing learning curves. The same helpers are used for LunarLander, Acrobot, and BipedalWalker so that all plots share a consistent style. A simple moving average is applied to highlight the long term trend in performance, and each figure is saved into the directory associated with a specific run.

In [ ]:
def moving_average(x, window: int = 20):
    x = np.asarray(x, dtype=np.float32)
    if x.size == 0:
        return x
    if window <= 1:
        return x

    if x.size < window:
        # Not enough points to compute a full window
        return np.full_like(x, np.nan, dtype=np.float32)

    kernel = np.ones(window, dtype=np.float32) / float(window)
    ma = np.convolve(x, kernel, mode="valid")
    pad = np.full(window - 1, np.nan, dtype=np.float32)
    return np.concatenate([pad, ma])


def plot_rewards(
    rewards,
    run_dir: str,
    filename: str,
    title: str,
    ma_window: int = 20,
):
    rewards = np.asarray(rewards, dtype=np.float32)
    ma = moving_average(rewards, window=ma_window)

    plt.figure(figsize=(8, 4.5))
    plt.plot(rewards, label="Episode return", alpha=0.4)
    plt.plot(ma, label=f"MA({ma_window})")
    plt.xlabel("Episode")
    plt.ylabel("Return")
    plt.title(title)
    plt.legend()
    plt.tight_layout()

    os.makedirs(run_dir, exist_ok=True)
    out_path = os.path.join(run_dir, filename)
    plt.savefig(out_path)
    plt.show()
    plt.close()

    print(f"Saved plot to {out_path}")

def plot_eval_returns(
    returns,
    run_dir: str,
    filename: str,
    title: str,
):
    returns = np.asarray(returns, dtype=np.float32)

    plt.figure(figsize=(8, 4.5))
    plt.plot(range(len(returns)), returns, marker="o")
    plt.xlabel("Episode")
    plt.ylabel("Return")
    plt.title(title)
    plt.tight_layout()

    os.makedirs(run_dir, exist_ok=True)
    out_path = os.path.join(run_dir, filename)
    plt.savefig(out_path)
    plt.show()
    plt.close()

    print(f"Saved eval plot to {out_path}")

3. PPO actor–critic networks¶

This section defines actor–critic networks that share a common feedforward torso and attach different output heads for discrete and continuous control. Subsection 3.1 builds the network for discrete action spaces such as LunarLander and Acrobot, and subsection 3.2 defines the variant for continuous actions used in BipedalWalker.

3.1 Discrete PPO actor–critic network¶

This cell defines an actor–critic network for discrete action spaces, used by LunarLander and Acrobot. A two layer multilayer perceptron maps the observation vector to a latent representation. A policy head outputs logits over the available actions and a value head predicts the state value. Orthogonal initialization is applied to the linear layers, with a smaller gain on the policy head for more stable behaviour at the start of training and unit gain on the value head for value regression. A configuration dataclass stores observation dimension, action count, and hidden sizes so that the same architecture can be recreated for different runs. Helper methods construct a categorical policy distribution from logits and a greedy action method selects the action with highest logit for deterministic evaluation.

In [ ]:
from dataclasses import dataclass
from torch.distributions import Categorical


@dataclass
class PPODiscreteModelConfig:
    obs_dim: int
    n_actions: int
    hidden_sizes: tuple = (256, 256)


class PPOActorCriticDiscrete(nn.Module):
    """
    Actor–critic network for discrete action spaces (PPO).
    """

    def __init__(self, obs_dim: int, n_actions: int, hidden_sizes=(256, 256)):
        super().__init__()
        h1, h2 = hidden_sizes

        self.torso = nn.Sequential(
            nn.Linear(obs_dim, h1),
            nn.ReLU(),
            nn.Linear(h1, h2),
            nn.ReLU(),
        )
        self.policy_head = nn.Linear(h2, n_actions)
        self.value_head = nn.Linear(h2, 1)

        # Orthogonal initialization
        for m in self.modules():
            if isinstance(m, nn.Linear) and m not in (self.policy_head, self.value_head):
                nn.init.orthogonal_(m.weight, gain=nn.init.calculate_gain("relu"))
                nn.init.zeros_(m.bias)
        nn.init.orthogonal_(self.policy_head.weight, gain=0.01)
        nn.init.zeros_(self.policy_head.bias)
        nn.init.orthogonal_(self.value_head.weight, gain=1.0)
        nn.init.zeros_(self.value_head.bias)

    def forward(self, x: torch.Tensor):
        x = x.float()
        z = self.torso(x)
        logits = self.policy_head(z)
        value = self.value_head(z).squeeze(-1)
        return logits, value

    @staticmethod
    def dist_from_logits(logits: torch.Tensor) -> Categorical:
        return Categorical(logits=logits)

    @torch.no_grad()
    def greedy_action(self, obs_np: np.ndarray) -> int:
        self.eval()
        device = next(self.parameters()).device
        x = torch.tensor(obs_np, dtype=torch.float32, device=device).unsqueeze(0)
        logits, _ = self(x)
        return int(torch.argmax(logits, dim=1).item())


def build_ppo_discrete_model_from_config(cfg: PPODiscreteModelConfig) -> PPOActorCriticDiscrete:
    return PPOActorCriticDiscrete(cfg.obs_dim, cfg.n_actions, cfg.hidden_sizes)

3.2 Continuous PPO actor–critic network¶

This cell defines an actor–critic network for continuous action spaces, used by BipedalWalker. As in the discrete case, a two layer multilayer perceptron maps the observation vector to a latent representation. A Gaussian policy head outputs the mean action vector and a trainable log standard deviation parameter, while a value head predicts the state value. The same orthogonal initialization scheme is used, with a smaller gain on the policy head and unit gain on the value head. A configuration dataclass records the observation dimension, action dimension, and hidden sizes so that the architecture can be reconstructed for different runs. Helper methods construct a Normal policy distribution from the mean and standard deviation, and a greedy action method returns the mean action, which will later be clipped to the valid action range by the environment wrapper.

In [ ]:
from dataclasses import dataclass
from torch.distributions import Normal


@dataclass
class PPOContinuousModelConfig:
    obs_dim: int
    act_dim: int
    hidden_sizes: tuple = (256, 256)


class PPOActorCriticContinuous(nn.Module):
    """
    Actor–critic network for continuous action spaces with a Gaussian policy (PPO).
    """

    def __init__(self, obs_dim: int, act_dim: int, hidden_sizes=(256, 256)):
        super().__init__()
        h1, h2 = hidden_sizes

        self.torso = nn.Sequential(
            nn.Linear(obs_dim, h1),
            nn.ReLU(),
            nn.Linear(h1, h2),
            nn.ReLU(),
        )
        self.mu_head = nn.Linear(h2, act_dim)
        self.value_head = nn.Linear(h2, 1)

        # Log standard deviation parameter per action dimension
        self.log_std = nn.Parameter(torch.zeros(act_dim))

        # Orthogonal initialization
        for m in self.modules():
            if isinstance(m, nn.Linear) and m not in (self.mu_head, self.value_head):
                nn.init.orthogonal_(m.weight, gain=nn.init.calculate_gain("relu"))
                nn.init.zeros_(m.bias)
        nn.init.orthogonal_(self.mu_head.weight, gain=0.01)
        nn.init.zeros_(self.mu_head.bias)
        nn.init.orthogonal_(self.value_head.weight, gain=1.0)
        nn.init.zeros_(self.value_head.bias)

    def forward(self, x: torch.Tensor):
        x = x.float()
        z = self.torso(x)
        mu = self.mu_head(z)
        value = self.value_head(z).squeeze(-1)
        std = torch.exp(self.log_std).expand_as(mu)
        return mu, std, value

    def dist_from_params(self, mu: torch.Tensor, std: torch.Tensor) -> Normal:
        return Normal(loc=mu, scale=std)

    @torch.no_grad()
    def greedy_action(self, obs_np: np.ndarray) -> np.ndarray:
        self.eval()
        device = next(self.parameters()).device
        x = torch.tensor(obs_np, dtype=torch.float32, device=device).unsqueeze(0)
        mu, std, _ = self(x)
        action = mu.squeeze(0).cpu().numpy()
        return action


def build_ppo_continuous_model_from_config(cfg: PPOContinuousModelConfig) -> PPOActorCriticContinuous:
    return PPOActorCriticContinuous(cfg.obs_dim, cfg.act_dim, cfg.hidden_sizes)

4. PPO loss and update helper¶

This section defines a helper that performs Proximal Policy Optimization (PPO) updates from a batch of rollout data. The helper takes a model, an optimizer, states, actions, stored log probabilities under the old policy, bootstrapped returns, and advantages. For discrete control it uses a categorical policy distribution built from logits, and for continuous control it uses a Normal policy distribution built from the mean and standard deviation. Advantages can be normalised to reduce variance. The clipped PPO objective restricts policy updates so that the new policy does not move too far away from the old policy in a single step. The combined loss trades off return maximisation, value fit, and entropy regularisation. Gradients are clipped to a maximum norm for stability, and several epochs of minibatch updates are performed over the same rollout. This helper is shared by LunarLander, Acrobot, and BipedalWalker, and the caller only needs to specify whether the update is discrete or continuous.

In [ ]:
from typing import Dict, Literal
from dataclasses import dataclass


@dataclass
class PPOUpdateConfig:
    """
    Hyperparameters for a single PPO update over a rollout.
    """
    clip_range: float = 0.2
    value_coef: float = 0.5
    entropy_coef: float = 0.01
    max_grad_norm: float = 0.5
    n_epochs: int = 4
    batch_size: int = 64
    normalize_adv: bool = True


def ppo_update(
    model: nn.Module,
    optimizer: torch.optim.Optimizer,
    s: torch.Tensor,           # (N, obs_dim)
    a: torch.Tensor,           # (N,) for discrete or (N, act_dim) for continuous
    logp_old: torch.Tensor,    # (N,) log probabilities under the behaviour policy
    returns: torch.Tensor,     # (N,) bootstrapped returns
    advantages: torch.Tensor,  # (N,) raw advantages
    *,
    control_type: Literal["discrete", "continuous"],
    cfg: PPOUpdateConfig,
) -> Dict[str, float]:
    model.train()

    N = s.shape[0]

    # normalize advantages
    adv = advantages.clone()
    if cfg.normalize_adv:
        adv = (adv - adv.mean()) / (adv.std() + 1e-8)

    # Flatten everything to (N, ...)
    s = s.view(N, -1)
    logp_old = logp_old.view(N)
    returns = returns.view(N)
    adv = adv.view(N)

    # Storage for logging
    total_loss = 0.0
    total_policy_loss = 0.0
    total_value_loss = 0.0
    total_entropy = 0.0
    total_kl = 0.0
    total_clip_fraction = 0.0
    n_batches = 0

    for epoch in range(cfg.n_epochs):
        # Create a random permutation and iterate over mini batches
        idx = torch.randperm(N, device=s.device)
        for start in range(0, N, cfg.batch_size):
            end = start + cfg.batch_size
            batch_idx = idx[start:end]
            if batch_idx.numel() == 0:
                continue

            sb = s[batch_idx]
            ab = a[batch_idx]
            logp_old_b = logp_old[batch_idx]
            returns_b = returns[batch_idx]
            adv_b = adv[batch_idx]

            # Forward pass and new distribution
            if control_type == "discrete":
                logits, values = model(sb)                 # PPO Actor Critic Discrete
                dist = model.dist_from_logits(logits)
                logp_new = dist.log_prob(ab)              # (B,)
                entropy = dist.entropy().mean()
            elif control_type == "continuous":
                mu, std, values = model(sb)                # PPO Actor Critic Continuous
                dist = model.dist_from_params(mu, std)
                logp_new = dist.log_prob(ab).sum(dim=-1)  # (B,)
                entropy = dist.entropy().sum(dim=-1).mean()
            else:
                raise ValueError(f"Unknown control_type {control_type}")

            values = values.view(-1)

            # Probability ratio for PPO
            ratio = torch.exp(logp_new - logp_old_b)      # (B,)

            # Clipped surrogate objective
            unclipped = ratio * adv_b
            clipped = torch.clamp(ratio, 1.0 - cfg.clip_range, 1.0 + cfg.clip_range) * adv_b
            policy_loss = -torch.min(unclipped, clipped).mean()

            # Value loss
            value_loss = 0.5 * (returns_b - values).pow(2).mean()

            # Total loss
            loss = policy_loss + cfg.value_coef * value_loss - cfg.entropy_coef * entropy

            # Backward and gradient clipping
            optimizer.zero_grad()
            loss.backward()
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
            optimizer.step()

            # Approximate KL divergence and clip fraction for monitoring
            with torch.no_grad():
                approx_kl = (logp_old_b - logp_new).mean().abs()
                clip_fraction = (torch.abs(ratio - 1.0) > cfg.clip_range).float().mean()

            # Accumulate statistics
            total_loss += float(loss.item())
            total_policy_loss += float(policy_loss.item())
            total_value_loss += float(value_loss.item())
            total_entropy += float(entropy.item())
            total_kl += float(approx_kl.item())
            total_clip_fraction += float(clip_fraction.item())
            n_batches += 1

    if n_batches == 0:
        return {
            "loss": 0.0,
            "policy_loss": 0.0,
            "value_loss": 0.0,
            "entropy": 0.0,
            "approx_kl": 0.0,
            "clip_fraction": 0.0,
        }

    return {
        "loss": total_loss / n_batches,
        "policy_loss": total_policy_loss / n_batches,
        "value_loss": total_value_loss / n_batches,
        "entropy": total_entropy / n_batches,
        "approx_kl": total_kl / n_batches,
        "clip_fraction": total_clip_fraction / n_batches,
    }

5. Returns and advantage computation¶

This section defines a helper function that turns a sequence of rewards, done flags, and predicted state values into bootstrapped returns and advantages. The computation runs backwards in time, applies the discount factor until a terminal step, and uses a final bootstrap value when the rollout is truncated by the time horizon. The result is a pair of tensors for returns and advantages that can be passed directly into the PPO update helper. The same logic applies to LunarLander, Acrobot, and BipedalWalker, so this function is shared across environments.

In [ ]:
def compute_returns_and_advantages(
    rewards: np.ndarray,
    dones: np.ndarray,
    values: torch.Tensor,
    last_value: float,
    gamma: float,
    device: torch.device,
) -> tuple[torch.Tensor, torch.Tensor]:

    T = len(rewards)
    returns = np.zeros(T, dtype=np.float32)
    g = float(last_value)

    for t in reversed(range(T)):
        g = rewards[t] + gamma * g * (1.0 - float(dones[t]))
        returns[t] = g

    G = torch.tensor(returns, dtype=torch.float32, device=device)
    A = G - values.detach()
    return G, A

6. PPO training loop for a single environment¶

This section defines a training loop for the Proximal Policy Optimization (PPO) algorithm that works for any single environment. The function creates an environment, runs rollout segments of fixed length, collects states, actions, rewards, done flags, value predictions, and log probabilities under the behaviour policy, and then computes bootstrapped returns and advantages. After each segment, it applies several epochs of PPO updates using the shared ppo_update helper.

Episode returns are tracked in a list so that total reward per episode can be plotted later with the shared plotting function. The control_type argument switches between discrete and continuous policies so that the same loop can train LunarLander, Acrobot, and BipedalWalker with the corresponding PPO actor–critic networks.

In [ ]:
from typing import Literal, Dict, Any, List


def train_ppo_single_env(
    env_id: str,
    model: nn.Module,
    control_type: Literal["discrete", "continuous"],
    run_dir: str,
    total_env_steps: int = 200_000,
    rollout_len: int = 2048,
    gamma: float = 0.99,
    ppo_cfg: PPOUpdateConfig | None = None,
    lr: float = 3e-4,
    log_every: int = 10_000,
) -> tuple[nn.Module, List[float], Dict[str, Any]]:
    """
    Train PPO on a single environment and return model, episode returns, and logs.
    """
    os.makedirs(run_dir, exist_ok=True)

    if ppo_cfg is None:
        ppo_cfg = PPOUpdateConfig()

    optimizer = optim.Adam(model.parameters(), lr=lr)
    env = make_env(env_id, worker_id=0, base_seed=SEED)

    episode_returns: List[float] = []
    current_return = 0.0
    steps_total = 0
    iteration = 0
    last_log_steps = 0
    wall_start = time.time()

    obs, info = env.reset()
    done = False
    truncated = False

    while steps_total < total_env_steps:
        iteration += 1

        # Rollout buffers for this segment
        states = []
        actions = []
        rewards = []
        dones = []
        values = []
        log_probs = []

        for t in range(rollout_len):
            # Store state
            states.append(np.array(obs, copy=True))

            # Forward pass and action sampling
            obs_tensor = torch.tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)

            if control_type == "discrete":
                logits, value = model(obs_tensor)                 # PPO Actor Critic Discrete
                dist = model.dist_from_logits(logits)
                action_tensor = dist.sample()
                log_prob_tensor = dist.log_prob(action_tensor)
                action = int(action_tensor.item())
                log_prob = float(log_prob_tensor.item())
            elif control_type == "continuous":
                mu, std, value = model(obs_tensor)                 # PPO Actor Critic Continuous
                dist = model.dist_from_params(mu, std)
                action_tensor = dist.sample()
                log_prob_tensor = dist.log_prob(action_tensor).sum(dim=-1)
                action = action_tensor.squeeze(0).cpu().numpy()
                log_prob = float(log_prob_tensor.item())
            else:
                raise ValueError(f"Unknown control_type {control_type}")

            values.append(float(value.squeeze(0).item()))
            actions.append(action)
            log_probs.append(log_prob)

            # Environment step
            obs, reward, done, truncated, info = env.step(action)
            rewards.append(float(reward))
            dones.append(bool(done))

            current_return += float(reward)
            steps_total += 1

            # Episode boundary
            if done or truncated:
                episode_returns.append(current_return)
                current_return = 0.0
                obs, info = env.reset()
                done = False
                truncated = False

            if steps_total >= total_env_steps:
                break

        # Bootstrap value for last state if not terminal
        if done or truncated:
            last_value = 0.0
        else:
            obs_tensor = torch.tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
            with torch.no_grad():
                if control_type == "discrete":
                    _, v_last = model(obs_tensor)
                else:
                    _, _, v_last = model(obs_tensor)
            last_value = float(v_last.squeeze(0).item())

        # Prepare tensors for PPO update
        states_tensor = torch.tensor(np.asarray(states), dtype=torch.float32, device=device)
        if control_type == "discrete":
            actions_tensor = torch.tensor(actions, dtype=torch.long, device=device)
        else:
            actions_tensor = torch.tensor(np.asarray(actions), dtype=torch.float32, device=device)

        values_tensor = torch.tensor(np.asarray(values), dtype=torch.float32, device=device)
        logp_old_tensor = torch.tensor(np.asarray(log_probs), dtype=torch.float32, device=device)

        returns_tensor, advantages_tensor = compute_returns_and_advantages(
            rewards=np.asarray(rewards, dtype=np.float32),
            dones=np.asarray(dones, dtype=np.bool_),
            values=values_tensor,
            last_value=last_value,
            gamma=gamma,
            device=device,
        )

        stats = ppo_update(
            model=model,
            optimizer=optimizer,
            s=states_tensor,
            a=actions_tensor,
            logp_old=logp_old_tensor,
            returns=returns_tensor,
            advantages=advantages_tensor,
            control_type=control_type,
            cfg=ppo_cfg,
        )

        # Logging
        if iteration == 1 or (steps_total - last_log_steps) >= log_every or steps_total >= total_env_steps:
            if episode_returns:
                avg10 = float(np.mean(episode_returns[-10:]))
            else:
                avg10 = float("nan")
            print(
                f"[PPO] it={iteration:>5} steps={steps_total:>8} avg10={avg10:7.2f} "
                f"loss={stats['loss']:.3f} pg={stats['policy_loss']:.3f} "
                f"vf={stats['value_loss']:.3f} H={stats['entropy']:.3f} "
                f"KL={stats['approx_kl']:.4f} clip_frac={stats['clip_fraction']:.3f}",
                flush=True,
            )
            last_log_steps = steps_total

    env.close()
    wall_time = time.time() - wall_start
    if episode_returns:
        avg10 = float(np.mean(episode_returns[-10:]))
    else:
        avg10 = float("nan")
    print(f"[PPO] done steps={steps_total} time={wall_time:.1f}s avg10={avg10:.2f}")

    logs: Dict[str, Any] = {
        "episode_returns": episode_returns,
        "steps_total": steps_total,
        "wall_time_sec": wall_time,
        "train_config": {
            "env_id": env_id,
            "total_env_steps": total_env_steps,
            "rollout_len": rollout_len,
            "gamma": gamma,
            "ppo_cfg": ppo_cfg.__dict__,
            "lr": lr,
            "log_every": log_every,
            "seed": SEED,
        },
    }
    return model, episode_returns, logs

7. Greedy evaluation helper¶

This section defines a helper function that evaluates a trained PPO actor–critic model in greedy mode on a given environment. The function creates a fresh evaluation environment, runs a fixed number of episodes, selects actions using the model’s greedy policy, and records the total return for each episode. The same helper works for discrete and continuous control by switching on a control type argument and reuses the greedy action methods of the actor–critic networks. The resulting list of episode returns is used to summarise performance.

In [ ]:
from typing import Literal, List, Tuple, Dict, Optional
import csv
import os

def evaluate_greedy(
    env_id: str,
    model: nn.Module,
    control_type: Literal["discrete", "continuous"],
    n_episodes: int,
    max_steps: int | None = None,
    base_seed: int = SEED,
    csv_path: Optional[str] = None,
) -> List[float]:

    model.eval()
    returns: List[float] = []
    rows: List[Dict[str, float | int]] = []

    for ep in range(n_episodes):
        seed = base_seed + ep

        # fresh eval env for this episode with its own seed
        env = make_eval_env(env_id, base_seed=seed)

        obs, info = env.reset()
        done = False
        truncated = False
        ep_return = 0.0
        steps = 0

        while not (done or truncated):
            if control_type == "discrete":
                action = model.greedy_action(obs)
            elif control_type == "continuous":
                action = model.greedy_action(obs)
            else:
                raise ValueError(f"Unknown control_type {control_type}")

            obs, reward, done, truncated, info = env.step(action)
            ep_return += float(reward)
            steps += 1

            if max_steps is not None and steps >= max_steps:
                break

        env.close()

        returns.append(ep_return)
        print(
            f"Eval episode {ep + 1} seed {seed} "
            f"return {ep_return:.2f} steps {steps}"
        )

        rows.append(
            {
                "episode": ep + 1,
                "seed": seed,
                "return": ep_return,
                "steps": steps,
            }
        )

    mean_ret = float(np.mean(returns)) if len(returns) > 0 else float("nan")
    std_ret = float(np.std(returns)) if len(returns) > 0 else float("nan")
    print(f"Greedy evaluation mean {mean_ret:.2f}  std {std_ret:.2f}")


    if csv_path is not None:
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        with open(csv_path, "w", newline="") as f:
            writer = csv.DictWriter(
                f, fieldnames=["episode", "seed", "return", "steps"]
            )
            writer.writeheader()
            writer.writerows(rows)
        print(f"Saved greedy eval log to {csv_path}")

    return returns

8. Video recording helper¶

This section defines a helper function that records one greedy evaluation episode for a trained PPO model. The function creates a video environment that writes frames into the videos subfolder of the corresponding run directory, runs a single episode using the model’s greedy policy, and then closes the environment. The same logic works for discrete and continuous control by switching on a control type argument. The function returns the total reward and the number of steps for the recorded episode.

In [ ]:
from typing import Literal, Tuple

def record_one_greedy_episode(
    env_id: str,
    model: nn.Module,
    control_type: Literal["discrete", "continuous"],
    run_dir: str,
    max_steps: int | None = None,
    seed: int | None = None,     # <--- NEW
) -> Tuple[float, int]:

    model.eval()

    video_dir = os.path.join(run_dir, "videos")
    base_seed = SEED if seed is None else seed

    env = make_video_env(env_id=env_id, video_dir=video_dir, base_seed=base_seed)

    obs, info = env.reset()
    done = False
    truncated = False
    ep_return = 0.0
    steps = 0

    while not (done or truncated):
        if control_type == "discrete":
            action = model.greedy_action(obs)
        elif control_type == "continuous":
            action = model.greedy_action(obs)
        else:
            raise ValueError(f"Unknown control_type {control_type}")

        obs, reward, done, truncated, info = env.step(action)
        ep_return += float(reward)
        steps += 1

        if max_steps is not None and steps >= max_steps:
            break

    env.close()
    print(
        f"Recorded greedy PPO episode return {ep_return:.2f} "
        f"steps {steps} with seed {base_seed} into {video_dir}"
    )
    return ep_return, steps


    import csv
import numpy as np



def record_best_greedy_from_csv(
    env_id: str,
    model: nn.Module,
    control_type: Literal["discrete", "continuous"],
    run_dir: str,
    csv_path: str,
    max_steps: int | None = None,
):

    # Load CSV and find best row
    best_row = None
    best_return = -np.inf

    with open(csv_path, "r", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            r = float(row["return"])
            if r > best_return:
                best_return = r
                best_row = row

    if best_row is None:
        print(f"No rows found in {csv_path}, nothing to record.")
        return

    best_seed = int(best_row["seed"])
    best_ep = int(best_row["episode"])
    best_steps = int(best_row["steps"])

    print(
        f"Best eval episode from CSV: ep={best_ep}, "
        f"seed={best_seed}, return={best_return:.2f}, steps={best_steps}"
    )

    # Record video using that seed
    rec_ret, rec_steps = record_one_greedy_episode(
        env_id=env_id,
        model=model,
        control_type=control_type,
        run_dir=run_dir,
        max_steps=max_steps,
        seed=best_seed,
    )

    print(
        f"Replayed best episode for video: return={rec_ret:.2f}, steps={rec_steps}"
    )

9. PPO on LunarLander: training and evaluation¶

This section applies the PPO implementation to the LunarLander environment, which has a discrete action space. A discrete PPO actor–critic network is constructed from the observation dimension and number of actions. The training loop runs for a fixed number of environment steps and logs episode returns. After training, episode returns are plotted, evaluation episodes are run in greedy mode, and a separate figure is saved for the evaluation returns. The trained model weights and evaluation statistics are stored, together with one recorded greedy evaluation episode.

Run#1

In [ ]:
# Discover LunarLander dimensions
tmp_env = make_env(LUNAR_ENV_ID, worker_id=0, base_seed=SEED)
obs_dim_lunar = tmp_env.observation_space.shape[0]
n_actions_lunar = tmp_env.action_space.n
tmp_env.close()

# PPO model configuration for LunarLander
lunar_cfg = PPODiscreteModelConfig(
    obs_dim=obs_dim_lunar,
    n_actions=n_actions_lunar,
    hidden_sizes=(256, 256),
)

lunar_model = build_ppo_discrete_model_from_config(lunar_cfg).to(device)

# PPO hyperparameters for LunarLander
lunar_ppo_cfg = PPOUpdateConfig(
    clip_range=0.2,
    value_coef=0.5,
    entropy_coef=0.01,
    max_grad_norm=0.5,
    n_epochs=4,
    batch_size=64,
    normalize_adv=True,
)

# Run directory for this LunarLander PPO run
lunar_run_name = "run_1_lunar_ppo"
lunar_run_dir = make_run_dir(LUNAR_ROOT, lunar_run_name)
print(f"LunarLander PPO run dir: {lunar_run_dir}")

# Training budget
lunar_total_steps = 200_000
lunar_rollout_len = 2048

# Train PPO on LunarLander
lunar_model, lunar_episode_returns, lunar_logs = train_ppo_single_env(
    env_id=LUNAR_ENV_ID,
    model=lunar_model,
    control_type="discrete",
    run_dir=lunar_run_dir,
    total_env_steps=lunar_total_steps,
    rollout_len=lunar_rollout_len,
    gamma=0.99,
    ppo_cfg=lunar_ppo_cfg,
    lr=3e-4,
    log_every=10_000,
)

# Save training returns and model
np.save(
    os.path.join(lunar_run_dir, "ppo_lunar_episode_returns.npy"),
    np.array(lunar_episode_returns, dtype=np.float32),
)

lunar_model_path = os.path.join(lunar_run_dir, "ppo_lunar_model.pth")
torch.save(lunar_model.state_dict(), lunar_model_path)
print(f"Saved LunarLander PPO model to {lunar_model_path}")

# Training curve
plot_rewards(
    rewards=lunar_episode_returns,
    run_dir=lunar_run_dir,
    filename="ppo_lunar_train_rewards.png",
    title="LunarLander - PPO training episode returns",
    ma_window=20,
)

# Greedy evaluation (10 episodes) with CSV logging
csv_path = os.path.join(lunar_run_dir, "ppo_lunar_eval_log.csv")

lunar_eval_returns = evaluate_greedy(
    env_id=LUNAR_ENV_ID,
    model=lunar_model,
    control_type="discrete",
    n_episodes=10,
    max_steps=1000,
    base_seed=SEED,         # so seeds are reproducible & logged
    csv_path=csv_path,      # where to write the per-episode log
)

# Save eval returns as .npy as well (for consistency with other runs)
np.save(
    os.path.join(lunar_run_dir, "ppo_lunar_eval_returns.npy"),
    np.array(lunar_eval_returns, dtype=np.float32),
)

# Eval plot
plot_eval_returns(
    returns=lunar_eval_returns,
    run_dir=lunar_run_dir,
    filename="ppo_lunar_eval_rewards.png",
    title="LunarLander - PPO greedy evaluation returns",
)

# Record video of the *best* greedy evaluation episode using the CSV
record_best_greedy_from_csv(
    env_id=LUNAR_ENV_ID,
    model=lunar_model,
    control_type="discrete",
    run_dir=lunar_run_dir,
    csv_path=csv_path,
    max_steps=1000,
)
LunarLander PPO run dir: a3_bonus_ppo_artifacts/lunar_lander/run_1_lunar_ppo
[PPO] it=    1 steps=    2048 avg10=-199.08 loss=3983.179 pg=-0.003 vf=7966.391 H=1.381 KL=0.0104 clip_frac=0.055
[PPO] it=    6 steps=   12288 avg10=-133.17 loss=514.058 pg=0.007 vf=1028.128 H=1.372 KL=0.0154 clip_frac=0.124
[PPO] it=   11 steps=   22528 avg10= -98.66 loss=427.852 pg=0.003 vf=855.726 H=1.362 KL=0.0167 clip_frac=0.164
[PPO] it=   16 steps=   32768 avg10=-125.56 loss=179.989 pg=-0.001 vf=360.006 H=1.279 KL=0.0201 clip_frac=0.186
[PPO] it=   21 steps=   43008 avg10=-178.81 loss=447.380 pg=0.007 vf=894.772 H=1.249 KL=0.0137 clip_frac=0.107
[PPO] it=   26 steps=   53248 avg10=-131.08 loss=119.240 pg=0.010 vf=238.484 H=1.242 KL=0.0268 clip_frac=0.280
[PPO] it=   31 steps=   63488 avg10=-167.43 loss=481.312 pg=-0.006 vf=962.661 H=1.180 KL=0.0192 clip_frac=0.129
[PPO] it=   36 steps=   73728 avg10=-231.79 loss=625.078 pg=-0.002 vf=1250.182 H=1.097 KL=0.0200 clip_frac=0.176
[PPO] it=   41 steps=   83968 avg10=-125.97 loss=331.642 pg=0.009 vf=663.289 H=1.176 KL=0.0245 clip_frac=0.181
[PPO] it=   46 steps=   94208 avg10= -80.68 loss=63.778 pg=0.002 vf=127.574 H=1.148 KL=0.0219 clip_frac=0.236
[PPO] it=   51 steps=  104448 avg10= -75.58 loss=84.260 pg=-0.005 vf=168.553 H=1.169 KL=0.0161 clip_frac=0.188
[PPO] it=   56 steps=  114688 avg10= -52.83 loss=341.454 pg=0.005 vf=682.921 H=1.125 KL=0.0140 clip_frac=0.118
[PPO] it=   61 steps=  124928 avg10=-117.57 loss=559.202 pg=0.002 vf=1118.419 H=0.952 KL=0.0195 clip_frac=0.140
[PPO] it=   66 steps=  135168 avg10=-121.85 loss=477.379 pg=0.005 vf=954.765 H=0.953 KL=0.0163 clip_frac=0.125
[PPO] it=   71 steps=  145408 avg10=-223.03 loss=362.928 pg=0.003 vf=725.870 H=0.977 KL=0.0173 clip_frac=0.104
[PPO] it=   76 steps=  155648 avg10=-236.17 loss=1151.527 pg=0.001 vf=2303.072 H=0.975 KL=0.0132 clip_frac=0.081
[PPO] it=   81 steps=  165888 avg10=-259.33 loss=601.277 pg=-0.001 vf=1202.574 H=0.955 KL=0.0173 clip_frac=0.110
[PPO] it=   86 steps=  176128 avg10=-398.91 loss=813.671 pg=0.004 vf=1627.349 H=0.778 KL=0.0221 clip_frac=0.215
[PPO] it=   91 steps=  186368 avg10=-420.18 loss=625.665 pg=0.004 vf=1251.337 H=0.766 KL=0.0197 clip_frac=0.165
[PPO] it=   96 steps=  196608 avg10=-461.94 loss=665.878 pg=0.006 vf=1331.760 H=0.739 KL=0.0422 clip_frac=0.179
[PPO] it=   98 steps=  200000 avg10=-548.98 loss=968.237 pg=0.013 vf=1936.457 H=0.404 KL=0.0734 clip_frac=0.168
[PPO] done steps=200000 time=159.7s avg10=-548.98
Saved LunarLander PPO model to a3_bonus_ppo_artifacts/lunar_lander/run_1_lunar_ppo/ppo_lunar_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/lunar_lander/run_1_lunar_ppo/ppo_lunar_train_rewards.png
Eval episode 1 seed 1227 return -647.78 steps 105
Eval episode 2 seed 1228 return -452.15 steps 74
Eval episode 3 seed 1229 return -771.80 steps 130
Eval episode 4 seed 1230 return -658.71 steps 121
Eval episode 5 seed 1231 return -600.53 steps 92
Eval episode 6 seed 1232 return -643.94 steps 163
Eval episode 7 seed 1233 return -605.75 steps 108
Eval episode 8 seed 1234 return -439.29 steps 71
Eval episode 9 seed 1235 return -586.09 steps 108
Eval episode 10 seed 1236 return -688.59 steps 95
Greedy evaluation mean -609.46  std 95.94
Saved greedy eval log to a3_bonus_ppo_artifacts/lunar_lander/run_1_lunar_ppo/ppo_lunar_eval_log.csv
No description has been provided for this image
Saved eval plot to a3_bonus_ppo_artifacts/lunar_lander/run_1_lunar_ppo/ppo_lunar_eval_rewards.png
Best eval episode from CSV: ep=8, seed=1234, return=-439.29, steps=71
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/lunar_lander/run_1_lunar_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
/usr/local/lib/python3.12/dist-packages/moviepy/config_defaults.py:47: SyntaxWarning: invalid escape sequence '\P'
  IMAGEMAGICK_BINARY = r"C:\Program Files\ImageMagick-6.8.8-Q16\magick.exe"
Recorded greedy PPO episode return -439.29 steps 71 with seed 1234 into a3_bonus_ppo_artifacts/lunar_lander/run_1_lunar_ppo/videos
Replayed best episode for video: return=-439.29, steps=71

Run#2

In [ ]:
# Fresh PPO model for run 2 (reuse same lunar_cfg)
lunar_model_run2 = build_ppo_discrete_model_from_config(lunar_cfg).to(device)

# PPO hyperparameters for LunarLander (run 2)
lunar_ppo_cfg_run2 = PPOUpdateConfig(
    clip_range=0.2,
    value_coef=0.5,
    entropy_coef=0.01,
    max_grad_norm=0.5,
    n_epochs=4,
    batch_size=64,
    normalize_adv=True,
)

# Run directory for this LunarLander PPO run
lunar_run_name_run2 = "run_2_lunar_ppo"
lunar_run_dir_run2 = make_run_dir(LUNAR_ROOT, lunar_run_name_run2)
print(f"LunarLander PPO run 2 dir: {lunar_run_dir_run2}")

# Training budget (slightly longer than run 1)
lunar_total_steps_run2 = 250_000
lunar_rollout_len_run2 = 2048

# Train PPO on LunarLander (run 2)
lunar_model_run2, lunar_episode_returns_run2, lunar_logs_run2 = train_ppo_single_env(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run2,
    control_type="discrete",
    run_dir=lunar_run_dir_run2,
    total_env_steps=lunar_total_steps_run2,
    rollout_len=lunar_rollout_len_run2,
    gamma=0.99,
    ppo_cfg=lunar_ppo_cfg_run2,
    lr=3e-4,
    log_every=10_000,
)

# Save training returns and model
np.save(
    os.path.join(lunar_run_dir_run2, "ppo_lunar_episode_returns.npy"),
    np.array(lunar_episode_returns_run2, dtype=np.float32),
)

lunar_model_path_run2 = os.path.join(lunar_run_dir_run2, "ppo_lunar_model.pth")
torch.save(lunar_model_run2.state_dict(), lunar_model_path_run2)
print(f"Saved LunarLander PPO run 2 model to {lunar_model_path_run2}")

# Training curve
plot_rewards(
    rewards=lunar_episode_returns_run2,
    run_dir=lunar_run_dir_run2,
    filename="ppo_lunar_train_rewards.png",
    title="LunarLander - PPO training episode returns (run 2)",
    ma_window=20,
)

# Greedy evaluation (10 episodes) with CSV logging
csv_path_run2 = os.path.join(lunar_run_dir_run2, "ppo_lunar_eval_log.csv")

lunar_eval_returns_run2 = evaluate_greedy(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run2,
    control_type="discrete",
    n_episodes=10,
    max_steps=1000,
    base_seed=SEED,          # same pattern as run 1
    csv_path=csv_path_run2,  # per-episode log for this run
)

# Save eval returns as .npy
np.save(
    os.path.join(lunar_run_dir_run2, "ppo_lunar_eval_returns.npy"),
    np.array(lunar_eval_returns_run2, dtype=np.float32),
)

# Eval plot
plot_eval_returns(
    returns=lunar_eval_returns_run2,
    run_dir=lunar_run_dir_run2,
    filename="ppo_lunar_eval_rewards.png",
    title="LunarLander - PPO greedy evaluation returns (run 2)",
)

# Record video of the *best* greedy evaluation episode (run 2)
record_best_greedy_from_csv(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run2,
    control_type="discrete",
    run_dir=lunar_run_dir_run2,
    csv_path=csv_path_run2,
    max_steps=1000,
)
LunarLander PPO run 2 dir: a3_bonus_ppo_artifacts/lunar_lander/run_2_lunar_ppo
[PPO] it=    1 steps=    2048 avg10=-186.69 loss=3911.622 pg=-0.011 vf=7823.293 H=1.376 KL=0.0144 clip_frac=0.148
[PPO] it=    6 steps=   12288 avg10=-197.85 loss=812.248 pg=0.010 vf=1624.499 H=1.094 KL=0.0167 clip_frac=0.158
[PPO] it=   11 steps=   22528 avg10=-154.07 loss=239.881 pg=-0.001 vf=479.778 H=0.792 KL=0.0199 clip_frac=0.149
[PPO] it=   16 steps=   32768 avg10=-134.54 loss=139.349 pg=0.001 vf=278.713 H=0.873 KL=0.0126 clip_frac=0.081
[PPO] it=   21 steps=   43008 avg10=-131.41 loss=141.085 pg=0.001 vf=282.182 H=0.763 KL=0.0250 clip_frac=0.160
[PPO] it=   26 steps=   53248 avg10=-119.25 loss=31.133 pg=0.009 vf=62.264 H=0.736 KL=0.0243 clip_frac=0.149
[PPO] it=   31 steps=   63488 avg10=-113.71 loss=268.508 pg=0.002 vf=537.019 H=0.342 KL=0.0082 clip_frac=0.042
[PPO] it=   36 steps=   73728 avg10=-114.14 loss=27.311 pg=0.004 vf=54.623 H=0.374 KL=0.0148 clip_frac=0.046
[PPO] it=   41 steps=   83968 avg10=-120.11 loss=21.000 pg=0.004 vf=41.998 H=0.259 KL=0.0080 clip_frac=0.036
[PPO] it=   46 steps=   94208 avg10=-106.40 loss=297.300 pg=0.000 vf=594.604 H=0.228 KL=0.0059 clip_frac=0.022
[PPO] it=   51 steps=  104448 avg10=-115.69 loss=105.548 pg=0.003 vf=211.094 H=0.186 KL=0.0095 clip_frac=0.036
[PPO] it=   56 steps=  114688 avg10=-112.49 loss=148.909 pg=0.000 vf=297.820 H=0.146 KL=0.0054 clip_frac=0.018
[PPO] it=   61 steps=  124928 avg10=-131.11 loss=248.688 pg=-0.000 vf=497.378 H=0.123 KL=0.0033 clip_frac=0.010
[PPO] it=   66 steps=  135168 avg10=-137.07 loss=93.037 pg=-0.002 vf=186.082 H=0.156 KL=0.0115 clip_frac=0.062
[PPO] it=   71 steps=  145408 avg10=-143.19 loss=16.825 pg=0.002 vf=33.647 H=0.051 KL=0.0073 clip_frac=0.008
[PPO] it=   76 steps=  155648 avg10=-134.40 loss=26.647 pg=0.000 vf=53.294 H=0.014 KL=0.0023 clip_frac=0.001
[PPO] it=   81 steps=  165888 avg10=-148.28 loss=204.160 pg=0.000 vf=408.320 H=0.006 KL=0.0002 clip_frac=0.000
[PPO] it=   86 steps=  176128 avg10=-124.82 loss=270.844 pg=-0.000 vf=541.687 H=0.007 KL=0.0004 clip_frac=0.001
[PPO] it=   91 steps=  186368 avg10=-146.28 loss=129.921 pg=0.016 vf=259.813 H=0.108 KL=0.0840 clip_frac=0.096
[PPO] it=   96 steps=  196608 avg10=-128.24 loss=195.777 pg=0.000 vf=391.555 H=0.013 KL=0.0006 clip_frac=0.000
[PPO] it=  101 steps=  206848 avg10=-141.45 loss=362.222 pg=0.000 vf=724.443 H=0.005 KL=0.0003 clip_frac=0.000
[PPO] it=  106 steps=  217088 avg10=-137.00 loss=163.329 pg=-0.000 vf=326.659 H=0.007 KL=0.0004 clip_frac=0.000
[PPO] it=  111 steps=  227328 avg10=-131.50 loss=80.213 pg=-0.000 vf=160.426 H=0.003 KL=0.0003 clip_frac=0.000
[PPO] it=  116 steps=  237568 avg10=-134.74 loss=45.286 pg=-0.021 vf=90.616 H=0.112 KL=0.0481 clip_frac=0.077
[PPO] it=  121 steps=  247808 avg10=-168.15 loss=173.927 pg=0.012 vf=347.834 H=0.170 KL=0.0161 clip_frac=0.057
[PPO] it=  123 steps=  250000 avg10=-156.31 loss=2.721 pg=-0.005 vf=5.452 H=0.028 KL=0.0423 clip_frac=0.023
[PPO] done steps=250000 time=208.2s avg10=-156.31
Saved LunarLander PPO run 2 model to a3_bonus_ppo_artifacts/lunar_lander/run_2_lunar_ppo/ppo_lunar_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/lunar_lander/run_2_lunar_ppo/ppo_lunar_train_rewards.png
Eval episode 1 seed 1227 return -129.17 steps 65
Eval episode 2 seed 1228 return -159.52 steps 59
Eval episode 3 seed 1229 return -145.76 steps 68
Eval episode 4 seed 1230 return -116.81 steps 62
Eval episode 5 seed 1231 return -138.79 steps 75
Eval episode 6 seed 1232 return -98.14 steps 57
Eval episode 7 seed 1233 return -157.32 steps 78
Eval episode 8 seed 1234 return -174.10 steps 63
Eval episode 9 seed 1235 return -111.18 steps 53
Eval episode 10 seed 1236 return -144.49 steps 85
Greedy evaluation mean -137.53  std 22.53
Saved greedy eval log to a3_bonus_ppo_artifacts/lunar_lander/run_2_lunar_ppo/ppo_lunar_eval_log.csv
No description has been provided for this image
Saved eval plot to a3_bonus_ppo_artifacts/lunar_lander/run_2_lunar_ppo/ppo_lunar_eval_rewards.png
Best eval episode from CSV: ep=6, seed=1232, return=-98.14, steps=57
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/lunar_lander/run_2_lunar_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return -98.14 steps 57 with seed 1232 into a3_bonus_ppo_artifacts/lunar_lander/run_2_lunar_ppo/videos
Replayed best episode for video: return=-98.14, steps=57

Run#3

In [ ]:
# Fresh PPO model for run 3 (same architecture as before)
lunar_model_run3 = build_ppo_discrete_model_from_config(lunar_cfg).to(device)


# Slightly lower entropy than run 1/2 and a bit more training
lunar_ppo_cfg_run3 = PPOUpdateConfig(
    clip_range=0.2,
    value_coef=0.5,
    entropy_coef=0.008,   # less exploration noise than 0.01
    max_grad_norm=0.5,
    n_epochs=6,           # a bit stronger update than run 1 (4 epochs)
    batch_size=64,
    normalize_adv=True,
)

# Run directory for this LunarLander PPO run
lunar_run_name_run3 = "run_3_lunar_ppo"
lunar_run_dir_run3 = make_run_dir(LUNAR_ROOT, lunar_run_name_run3)
print(f"LunarLander PPO run dir: {lunar_run_dir_run3}")

# Training budget (longer than run 1, similar or a bit more than run 2)
lunar_total_steps_run3 = 400_000
lunar_rollout_len_run3 = 2048

# Train PPO on LunarLander (run 3)
lunar_model_run3, lunar_episode_returns_run3, lunar_logs_run3 = train_ppo_single_env(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run3,
    control_type="discrete",
    run_dir=lunar_run_dir_run3,
    total_env_steps=lunar_total_steps_run3,
    rollout_len=lunar_rollout_len_run3,
    gamma=0.99,
    ppo_cfg=lunar_ppo_cfg_run3,
    lr=3e-4,
    log_every=10_000,
)

# Save training returns and model
np.save(
    os.path.join(lunar_run_dir_run3, "ppo_lunar_episode_returns.npy"),
    np.array(lunar_episode_returns_run3, dtype=np.float32),
)

lunar_model_path_run3 = os.path.join(lunar_run_dir_run3, "ppo_lunar_model.pth")
torch.save(lunar_model_run3.state_dict(), lunar_model_path_run3)
print(f"Saved LunarLander PPO model to {lunar_model_path_run3}")

# Training curve for run 3
plot_rewards(
    rewards=lunar_episode_returns_run3,
    run_dir=lunar_run_dir_run3,
    filename="ppo_lunar_train_rewards.png",
    title="LunarLander - PPO training episode returns (run 3)",
    ma_window=20,
)

# Greedy evaluation with seed logging (run 3)
csv_path_run3 = os.path.join(lunar_run_dir_run3, "ppo_lunar_eval_log.csv")

lunar_eval_returns_run3 = evaluate_greedy(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run3,
    control_type="discrete",
    n_episodes=10,
    max_steps=1000,
    base_seed=SEED,
    csv_path=csv_path_run3,
)

# Save eval returns for run 3
np.save(
    os.path.join(lunar_run_dir_run3, "ppo_lunar_eval_returns.npy"),
    np.array(lunar_eval_returns_run3, dtype=np.float32),
)

# Eval plot for run 3
plot_eval_returns(
    returns=lunar_eval_returns_run3,
    run_dir=lunar_run_dir_run3,
    filename="ppo_lunar_eval_rewards.png",
    title="LunarLander - PPO greedy evaluation returns (run 3)",
)

# Record best episode as video for run 3
record_best_greedy_from_csv(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run3,
    control_type="discrete",
    run_dir=lunar_run_dir_run3,
    csv_path=csv_path_run3,
    max_steps=1000,
)
LunarLander PPO run dir: a3_bonus_ppo_artifacts/lunar_lander/run_3_lunar_ppo
[PPO] it=    1 steps=    2048 avg10=-149.78 loss=1911.891 pg=-0.000 vf=3823.805 H=1.383 KL=0.0083 clip_frac=0.037
[PPO] it=    6 steps=   12288 avg10=-143.59 loss=457.121 pg=0.001 vf=914.262 H=1.354 KL=0.0167 clip_frac=0.123
[PPO] it=   11 steps=   22528 avg10=-183.20 loss=168.599 pg=0.002 vf=337.215 H=1.325 KL=0.0189 clip_frac=0.176
[PPO] it=   16 steps=   32768 avg10=-158.91 loss=328.653 pg=0.004 vf=657.319 H=1.320 KL=0.0220 clip_frac=0.250
[PPO] it=   21 steps=   43008 avg10=-126.43 loss=360.168 pg=-0.008 vf=720.371 H=1.294 KL=0.0167 clip_frac=0.198
[PPO] it=   26 steps=   53248 avg10= -62.87 loss=38.718 pg=0.005 vf=77.446 H=1.277 KL=0.0216 clip_frac=0.225
[PPO] it=   31 steps=   63488 avg10= -95.12 loss=155.756 pg=-0.000 vf=311.530 H=1.138 KL=0.0203 clip_frac=0.207
[PPO] it=   36 steps=   73728 avg10=-202.66 loss=249.063 pg=0.002 vf=498.136 H=0.953 KL=0.0212 clip_frac=0.205
[PPO] it=   41 steps=   83968 avg10=-440.64 loss=964.793 pg=0.006 vf=1929.588 H=0.873 KL=0.0196 clip_frac=0.176
[PPO] it=   46 steps=   94208 avg10=-1443.35 loss=2582.284 pg=0.016 vf=5164.547 H=0.704 KL=0.0329 clip_frac=0.149
[PPO] it=   51 steps=  104448 avg10=-851.13 loss=5293.773 pg=0.002 vf=10587.545 H=0.137 KL=0.0093 clip_frac=0.017
[PPO] it=   56 steps=  114688 avg10=-898.44 loss=1165.533 pg=0.005 vf=2331.058 H=0.101 KL=0.0263 clip_frac=0.034
[PPO] it=   61 steps=  124928 avg10=-765.82 loss=213.658 pg=-0.000 vf=427.316 H=0.003 KL=0.0000 clip_frac=0.000
[PPO] it=   66 steps=  135168 avg10=-653.71 loss=452.120 pg=-0.000 vf=904.239 H=0.002 KL=0.0000 clip_frac=0.000
[PPO] it=   71 steps=  145408 avg10=-814.52 loss=374.739 pg=-0.000 vf=749.479 H=0.003 KL=0.0003 clip_frac=0.000
[PPO] it=   76 steps=  155648 avg10=-1024.10 loss=716.886 pg=-0.000 vf=1433.773 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=   81 steps=  165888 avg10=-883.67 loss=127.411 pg=0.000 vf=254.822 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=   86 steps=  176128 avg10=-687.01 loss=187.602 pg=0.000 vf=375.204 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=   91 steps=  186368 avg10=-724.83 loss=52.128 pg=0.000 vf=104.257 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=   96 steps=  196608 avg10=-600.37 loss=966.967 pg=0.000 vf=1933.935 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  101 steps=  206848 avg10=-991.36 loss=188.054 pg=-0.000 vf=376.109 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  106 steps=  217088 avg10=-866.56 loss=502.515 pg=0.000 vf=1005.030 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  111 steps=  227328 avg10=-708.58 loss=167.219 pg=-0.000 vf=334.438 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  116 steps=  237568 avg10=-755.79 loss=296.122 pg=-0.000 vf=592.245 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  121 steps=  247808 avg10=-833.96 loss=339.741 pg=0.000 vf=679.482 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  126 steps=  258048 avg10=-876.27 loss=232.792 pg=-0.000 vf=465.585 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  131 steps=  268288 avg10=-943.78 loss=1324.268 pg=0.000 vf=2648.536 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  136 steps=  278528 avg10=-717.06 loss=163.192 pg=-0.000 vf=326.384 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  141 steps=  288768 avg10=-1064.40 loss=609.796 pg=-0.000 vf=1219.592 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  146 steps=  299008 avg10=-827.13 loss=64.799 pg=0.000 vf=129.598 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  151 steps=  309248 avg10=-681.16 loss=157.403 pg=0.000 vf=314.805 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  156 steps=  319488 avg10=-1387.99 loss=2394.573 pg=0.000 vf=4789.146 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  161 steps=  329728 avg10=-656.19 loss=502.784 pg=0.000 vf=1005.569 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  166 steps=  339968 avg10=-891.77 loss=216.951 pg=0.000 vf=433.902 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  171 steps=  350208 avg10=-719.98 loss=53.326 pg=-0.000 vf=106.651 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  176 steps=  360448 avg10=-675.63 loss=105.941 pg=-0.000 vf=211.882 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  181 steps=  370688 avg10=-645.05 loss=50.773 pg=0.000 vf=101.546 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  186 steps=  380928 avg10=-772.60 loss=163.614 pg=0.000 vf=327.228 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  191 steps=  391168 avg10=-649.40 loss=61.151 pg=0.000 vf=122.301 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] it=  196 steps=  400000 avg10=-884.99 loss=256.687 pg=0.000 vf=513.373 H=0.000 KL=0.0000 clip_frac=0.000
[PPO] done steps=400000 time=415.9s avg10=-884.99
Saved LunarLander PPO model to a3_bonus_ppo_artifacts/lunar_lander/run_3_lunar_ppo/ppo_lunar_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/lunar_lander/run_3_lunar_ppo/ppo_lunar_train_rewards.png
Eval episode 1 seed 1227 return -647.78 steps 105
Eval episode 2 seed 1228 return -452.15 steps 74
Eval episode 3 seed 1229 return -771.80 steps 130
Eval episode 4 seed 1230 return -723.20 steps 119
Eval episode 5 seed 1231 return -600.53 steps 92
Eval episode 6 seed 1232 return -854.45 steps 158
Eval episode 7 seed 1233 return -721.47 steps 110
Eval episode 8 seed 1234 return -439.29 steps 71
Eval episode 9 seed 1235 return -586.09 steps 108
Eval episode 10 seed 1236 return -688.59 steps 95
Greedy evaluation mean -648.54  std 125.96
Saved greedy eval log to a3_bonus_ppo_artifacts/lunar_lander/run_3_lunar_ppo/ppo_lunar_eval_log.csv
No description has been provided for this image
Saved eval plot to a3_bonus_ppo_artifacts/lunar_lander/run_3_lunar_ppo/ppo_lunar_eval_rewards.png
Best eval episode from CSV: ep=8, seed=1234, return=-439.29, steps=71
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/lunar_lander/run_3_lunar_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return -439.29 steps 71 with seed 1234 into a3_bonus_ppo_artifacts/lunar_lander/run_3_lunar_ppo/videos
Replayed best episode for video: return=-439.29, steps=71

Run#4

In [ ]:
# Fresh model for run 4
lunar_model_run4 = build_ppo_discrete_model_from_config(lunar_cfg).to(device)

# PPO hyperparameters for run 4
lunar_ppo_cfg_run4 = PPOUpdateConfig(
    clip_range=0.2,
    value_coef=0.5,
    entropy_coef=0.006,   # a bit lower exploration noise than run 3
    max_grad_norm=0.5,
    n_epochs=5,           # slightly stronger updates
    batch_size=64,
    normalize_adv=True,
)

# Run directory for run 4
lunar_run_name_run4 = "run_4_lunar_ppo"
lunar_run_dir_run4 = make_run_dir(LUNAR_ROOT, lunar_run_name_run4)
print(f"LunarLander PPO run 4 dir: {lunar_run_dir_run4}")

# Training budget for run 4
lunar_total_steps_run4 = 400_000      # longer than run 1,2,3
lunar_rollout_len_run4 = 2048

# Train PPO on LunarLander (run 4)
lunar_model_run4, lunar_episode_returns_run4, lunar_logs_run4 = train_ppo_single_env(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run4,
    control_type="discrete",
    run_dir=lunar_run_dir_run4,
    total_env_steps=lunar_total_steps_run4,
    rollout_len=lunar_rollout_len_run4,
    gamma=0.99,
    ppo_cfg=lunar_ppo_cfg_run4,
    lr=2.5e-4,
    log_every=20_000,
)

# Save training returns and model
np.save(
    os.path.join(lunar_run_dir_run4, "ppo_lunar_episode_returns.npy"),
    np.array(lunar_episode_returns_run4, dtype=np.float32),
)

lunar_model_path_run4 = os.path.join(lunar_run_dir_run4, "ppo_lunar_model.pth")
torch.save(lunar_model_run4.state_dict(), lunar_model_path_run4)
print(f"Saved LunarLander PPO run 4 model to {lunar_model_path_run4}")

# Training curve (run 4)
plot_rewards(
    rewards=lunar_episode_returns_run4,
    run_dir=lunar_run_dir_run4,
    filename="ppo_lunar_train_rewards.png",
    title="LunarLander - PPO training episode returns (run 4)",
    ma_window=20,
)

# Greedy evaluation with seed logging (run 4)
csv_path_run4 = os.path.join(lunar_run_dir_run4, "ppo_lunar_eval_log.csv")

lunar_eval_returns_run4 = evaluate_greedy(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run4,
    control_type="discrete",
    n_episodes=10,
    max_steps=1000,
    base_seed=SEED,
    csv_path=csv_path_run4,
)

# Save eval returns as .npy
np.save(
    os.path.join(lunar_run_dir_run4, "ppo_lunar_eval_returns.npy"),
    np.array(lunar_eval_returns_run4, dtype=np.float32),
)

# Eval plot (run 4)
plot_eval_returns(
    returns=lunar_eval_returns_run4,
    run_dir=lunar_run_dir_run4,
    filename="ppo_lunar_eval_rewards.png",
    title="LunarLander - PPO greedy evaluation returns (run 4)",
)

# Record video of the best greedy episode for run 4
record_best_greedy_from_csv(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run4,
    control_type="discrete",
    run_dir=lunar_run_dir_run4,
    csv_path=csv_path_run4,
    max_steps=1000,
)
LunarLander PPO run 4 dir: a3_bonus_ppo_artifacts/lunar_lander/run_4_lunar_ppo
[PPO] it=    1 steps=    2048 avg10=-202.13 loss=2861.126 pg=-0.003 vf=5722.275 H=1.382 KL=0.0079 clip_frac=0.029
[PPO] it=   11 steps=   22528 avg10=-137.70 loss=479.405 pg=0.002 vf=958.822 H=1.357 KL=0.0168 clip_frac=0.191
[PPO] it=   21 steps=   43008 avg10=-131.58 loss=228.743 pg=-0.001 vf=457.504 H=1.345 KL=0.0172 clip_frac=0.181
[PPO] it=   31 steps=   63488 avg10= -95.66 loss=144.174 pg=0.010 vf=288.343 H=1.233 KL=0.0165 clip_frac=0.138
[PPO] it=   41 steps=   83968 avg10= -72.74 loss=50.893 pg=0.005 vf=101.792 H=1.173 KL=0.0163 clip_frac=0.187
[PPO] it=   51 steps=  104448 avg10=-103.26 loss=89.806 pg=0.005 vf=179.615 H=1.171 KL=0.0190 clip_frac=0.161
[PPO] it=   61 steps=  124928 avg10=-116.29 loss=178.811 pg=0.001 vf=357.633 H=1.065 KL=0.0121 clip_frac=0.131
[PPO] it=   71 steps=  145408 avg10=-104.01 loss=93.293 pg=0.005 vf=186.589 H=1.062 KL=0.0189 clip_frac=0.146
[PPO] it=   81 steps=  165888 avg10= -75.45 loss=73.676 pg=0.005 vf=147.354 H=1.097 KL=0.0171 clip_frac=0.145
[PPO] it=   91 steps=  186368 avg10= -96.32 loss=373.907 pg=0.002 vf=747.821 H=1.060 KL=0.0095 clip_frac=0.052
[PPO] it=  101 steps=  206848 avg10= -21.84 loss=54.164 pg=0.003 vf=108.335 H=1.096 KL=0.0150 clip_frac=0.136
[PPO] it=  111 steps=  227328 avg10=-172.97 loss=329.834 pg=0.001 vf=659.679 H=1.062 KL=0.0223 clip_frac=0.229
[PPO] it=  121 steps=  247808 avg10=-221.51 loss=197.438 pg=0.005 vf=394.876 H=0.867 KL=0.0238 clip_frac=0.196
[PPO] it=  131 steps=  268288 avg10= -23.09 loss=40.899 pg=0.003 vf=81.807 H=1.083 KL=0.0137 clip_frac=0.122
[PPO] it=  141 steps=  288768 avg10= -55.24 loss=102.878 pg=0.004 vf=205.761 H=1.072 KL=0.0195 clip_frac=0.159
[PPO] it=  151 steps=  309248 avg10= -19.20 loss=101.434 pg=0.001 vf=202.880 H=1.108 KL=0.0135 clip_frac=0.121
[PPO] it=  161 steps=  329728 avg10=  -6.22 loss=111.441 pg=0.001 vf=222.893 H=1.076 KL=0.0141 clip_frac=0.132
[PPO] it=  171 steps=  350208 avg10=  -0.49 loss=27.062 pg=0.002 vf=54.134 H=1.172 KL=0.0203 clip_frac=0.209
[PPO] it=  181 steps=  370688 avg10=   9.54 loss=101.555 pg=-0.000 vf=203.125 H=1.189 KL=0.0153 clip_frac=0.134
[PPO] it=  191 steps=  391168 avg10= -11.40 loss=30.659 pg=0.004 vf=61.325 H=1.181 KL=0.0177 clip_frac=0.181
[PPO] it=  196 steps=  400000 avg10=  41.89 loss=241.301 pg=0.007 vf=482.600 H=1.120 KL=0.0360 clip_frac=0.373
[PPO] done steps=400000 time=411.9s avg10=41.89
Saved LunarLander PPO run 4 model to a3_bonus_ppo_artifacts/lunar_lander/run_4_lunar_ppo/ppo_lunar_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/lunar_lander/run_4_lunar_ppo/ppo_lunar_train_rewards.png
Eval episode 1 seed 1227 return 223.64 steps 369
Eval episode 2 seed 1228 return 285.11 steps 360
Eval episode 3 seed 1229 return 106.97 steps 1000
Eval episode 4 seed 1230 return -33.58 steps 425
Eval episode 5 seed 1231 return 224.26 steps 463
Eval episode 6 seed 1232 return 224.38 steps 365
Eval episode 7 seed 1233 return 222.32 steps 487
Eval episode 8 seed 1234 return 287.90 steps 341
Eval episode 9 seed 1235 return 16.40 steps 278
Eval episode 10 seed 1236 return 260.41 steps 358
Greedy evaluation mean 181.78  std 106.99
Saved greedy eval log to a3_bonus_ppo_artifacts/lunar_lander/run_4_lunar_ppo/ppo_lunar_eval_log.csv
No description has been provided for this image
Saved eval plot to a3_bonus_ppo_artifacts/lunar_lander/run_4_lunar_ppo/ppo_lunar_eval_rewards.png
Best eval episode from CSV: ep=8, seed=1234, return=287.90, steps=341
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/lunar_lander/run_4_lunar_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 287.90 steps 341 with seed 1234 into a3_bonus_ppo_artifacts/lunar_lander/run_4_lunar_ppo/videos
Replayed best episode for video: return=287.90, steps=341

Run#5

In [ ]:
# fresh PPO model for run 5
lunar_model_run5 = build_ppo_discrete_model_from_config(lunar_cfg).to(device)

# PPO hyperparameters for run 5
lunar_ppo_cfg_run5 = PPOUpdateConfig(
    clip_range=0.20,
    value_coef=0.5,
    entropy_coef=0.008,   # a bit less exploration than 0.01
    max_grad_norm=0.5,
    n_epochs=4,           # keep moderate updates per batch
    batch_size=64,
    normalize_adv=True,
)

# run-5 directory
lunar_run_name_run5 = "run_5_lunar_ppo"
lunar_run_dir_run5 = make_run_dir(LUNAR_ROOT, lunar_run_name_run5)
print(f"LunarLander PPO run 5 dir: {lunar_run_dir_run5}")

# training budget for run 5 (longer than run 1–4)
lunar_total_steps_run5 = 400_000
lunar_rollout_len_run5 = 2048

# Train PPO on LunarLander (run 5)
lunar_model_run5, lunar_episode_returns_run5, lunar_logs_run5 = train_ppo_single_env(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run5,
    control_type="discrete",
    run_dir=lunar_run_dir_run5,
    total_env_steps=lunar_total_steps_run5,
    rollout_len=lunar_rollout_len_run5,
    gamma=0.99,
    ppo_cfg=lunar_ppo_cfg_run5,
    lr=2.5e-4,        # a bit smaller than 3e-4 for stability
    log_every=20_000,
)

# save training returns and model
np.save(
    os.path.join(lunar_run_dir_run5, "ppo_lunar_episode_returns.npy"),
    np.array(lunar_episode_returns_run5, dtype=np.float32),
)

lunar_model_path_run5 = os.path.join(lunar_run_dir_run5, "ppo_lunar_model.pth")
torch.save(lunar_model_run5.state_dict(), lunar_model_path_run5)
print(f"Saved LunarLander PPO run 5 model to {lunar_model_path_run5}")

# training curve
plot_rewards(
    rewards=lunar_episode_returns_run5,
    run_dir=lunar_run_dir_run5,
    filename="ppo_lunar_train_rewards.png",
    title="LunarLander - PPO training episode returns (run 5)",
    ma_window=20,
)

# Greedy evaluation with CSV logging (run 5)
csv_path_run5 = os.path.join(lunar_run_dir_run5, "ppo_lunar_eval_log.csv")

lunar_eval_returns_run5 = evaluate_greedy(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run5,
    control_type="discrete",
    n_episodes=10,
    max_steps=1000,
    base_seed=SEED,          # reproducible seeds
    csv_path=csv_path_run5,  # log each episode: ep, seed, return, steps
)

# save eval returns
np.save(
    os.path.join(lunar_run_dir_run5, "ppo_lunar_eval_returns.npy"),
    np.array(lunar_eval_returns_run5, dtype=np.float32),
)

# eval plot
plot_eval_returns(
    returns=lunar_eval_returns_run5,
    run_dir=lunar_run_dir_run5,
    filename="ppo_lunar_eval_rewards.png",
    title="LunarLander - PPO greedy evaluation returns (run 5)",
)

# Record video for the best eval episode (run 5)
record_best_greedy_from_csv(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run5,
    control_type="discrete",
    run_dir=lunar_run_dir_run5,
    csv_path=csv_path_run5,
    max_steps=1000,
)
LunarLander PPO run 5 dir: a3_bonus_ppo_artifacts/lunar_lander/run_5_lunar_ppo
[PPO] it=    1 steps=    2048 avg10=-198.29 loss=4092.692 pg=-0.003 vf=8185.413 H=1.381 KL=0.0095 clip_frac=0.044
[PPO] it=   11 steps=   22528 avg10=-146.68 loss=355.569 pg=0.002 vf=711.154 H=1.219 KL=0.0158 clip_frac=0.109
[PPO] it=   21 steps=   43008 avg10=-118.85 loss=285.014 pg=0.002 vf=570.042 H=1.141 KL=0.0176 clip_frac=0.135
[PPO] it=   31 steps=   63488 avg10=-103.45 loss=130.185 pg=-0.002 vf=260.389 H=1.046 KL=0.0187 clip_frac=0.151
[PPO] it=   41 steps=   83968 avg10=-163.98 loss=507.808 pg=-0.003 vf=1015.641 H=1.259 KL=0.0150 clip_frac=0.087
[PPO] it=   51 steps=  104448 avg10= -80.85 loss=454.245 pg=0.001 vf=908.507 H=1.217 KL=0.0116 clip_frac=0.058
[PPO] it=   61 steps=  124928 avg10=-240.90 loss=610.307 pg=0.000 vf=1220.629 H=1.047 KL=0.0169 clip_frac=0.122
[PPO] it=   71 steps=  145408 avg10=-157.59 loss=623.915 pg=-0.002 vf=1247.850 H=1.053 KL=0.0149 clip_frac=0.139
[PPO] it=   81 steps=  165888 avg10= -76.68 loss=423.825 pg=0.010 vf=847.650 H=1.182 KL=0.0229 clip_frac=0.258
[PPO] it=   91 steps=  186368 avg10=-120.33 loss=365.690 pg=0.004 vf=731.390 H=1.101 KL=0.0172 clip_frac=0.137
[PPO] it=  101 steps=  206848 avg10=-149.51 loss=375.892 pg=0.007 vf=751.786 H=1.073 KL=0.0182 clip_frac=0.187
[PPO] it=  111 steps=  227328 avg10=-281.07 loss=100.237 pg=0.009 vf=200.474 H=1.079 KL=0.0432 clip_frac=0.340
[PPO] it=  121 steps=  247808 avg10=-579.48 loss=219.167 pg=0.008 vf=438.333 H=0.900 KL=0.0271 clip_frac=0.214
[PPO] it=  131 steps=  268288 avg10=-334.10 loss=126.839 pg=0.003 vf=253.685 H=0.785 KL=0.0312 clip_frac=0.254
[PPO] it=  141 steps=  288768 avg10=-111.84 loss=156.717 pg=0.001 vf=313.446 H=0.933 KL=0.0174 clip_frac=0.189
[PPO] it=  151 steps=  309248 avg10=-237.80 loss=199.959 pg=0.007 vf=399.920 H=0.979 KL=0.0218 clip_frac=0.186
[PPO] it=  161 steps=  329728 avg10= -32.56 loss=248.046 pg=0.000 vf=496.105 H=0.898 KL=0.0136 clip_frac=0.140
[PPO] it=  171 steps=  350208 avg10= -64.95 loss=94.193 pg=0.005 vf=188.390 H=0.900 KL=0.0231 clip_frac=0.196
[PPO] it=  181 steps=  370688 avg10=-150.44 loss=68.070 pg=0.011 vf=136.133 H=0.900 KL=0.0241 clip_frac=0.273
[PPO] it=  191 steps=  391168 avg10=-168.98 loss=85.677 pg=-0.003 vf=171.374 H=0.936 KL=0.0331 clip_frac=0.290
[PPO] it=  196 steps=  400000 avg10=-225.67 loss=89.697 pg=-0.002 vf=179.412 H=0.915 KL=0.0128 clip_frac=0.107
[PPO] done steps=400000 time=417.5s avg10=-225.67
Saved LunarLander PPO run 5 model to a3_bonus_ppo_artifacts/lunar_lander/run_5_lunar_ppo/ppo_lunar_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/lunar_lander/run_5_lunar_ppo/ppo_lunar_train_rewards.png
Eval episode 1 seed 1227 return -447.18 steps 243
Eval episode 2 seed 1228 return -404.89 steps 305
Eval episode 3 seed 1229 return -428.75 steps 253
Eval episode 4 seed 1230 return -431.27 steps 216
Eval episode 5 seed 1231 return -369.30 steps 162
Eval episode 6 seed 1232 return -413.95 steps 239
Eval episode 7 seed 1233 return -391.90 steps 181
Eval episode 8 seed 1234 return -363.06 steps 292
Eval episode 9 seed 1235 return -406.66 steps 267
Eval episode 10 seed 1236 return -360.73 steps 256
Greedy evaluation mean -401.77  std 28.59
Saved greedy eval log to a3_bonus_ppo_artifacts/lunar_lander/run_5_lunar_ppo/ppo_lunar_eval_log.csv
No description has been provided for this image
Saved eval plot to a3_bonus_ppo_artifacts/lunar_lander/run_5_lunar_ppo/ppo_lunar_eval_rewards.png
Best eval episode from CSV: ep=10, seed=1236, return=-360.73, steps=256
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/lunar_lander/run_5_lunar_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return -360.73 steps 256 with seed 1236 into a3_bonus_ppo_artifacts/lunar_lander/run_5_lunar_ppo/videos
Replayed best episode for video: return=-360.73, steps=256

Run#6

In [ ]:
# Fresh PPO model for run 6 (no weight sharing with earlier runs)
lunar_model_run6 = build_ppo_discrete_model_from_config(lunar_cfg).to(device)


lunar_ppo_cfg_run6 = PPOUpdateConfig(
    clip_range=0.18,    # a bit more conservative than 0.20
    value_coef=0.5,
    entropy_coef=0.004, # less exploration than run 5
    max_grad_norm=0.5,
    n_epochs=5,         # moderate number of epochs
    batch_size=64,
    normalize_adv=True,
)

# Run directory for this LunarLander PPO run
lunar_run_name_run6 = "run_6_lunar_ppo"
lunar_run_dir_run6 = make_run_dir(LUNAR_ROOT, lunar_run_name_run6)
print(f"LunarLander PPO run 6 dir: {lunar_run_dir_run6}")

# Training budget
lunar_total_steps_run6 = 450_000     # a bit more than earlier 400k-style runs
lunar_rollout_len_run6 = 2048

# Train PPO on LunarLander (run 6)
lunar_model_run6, lunar_episode_returns_run6, lunar_logs_run6 = train_ppo_single_env(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run6,
    control_type="discrete",
    run_dir=lunar_run_dir_run6,
    total_env_steps=lunar_total_steps_run6,
    rollout_len=lunar_rollout_len_run6,
    gamma=0.99,
    ppo_cfg=lunar_ppo_cfg_run6,
    lr=2.0e-4,          # slightly smaller LR than run 5 for smoother updates
    log_every=20_000,
)

# Save training returns and model
np.save(
    os.path.join(lunar_run_dir_run6, "ppo_lunar_episode_returns.npy"),
    np.array(lunar_episode_returns_run6, dtype=np.float32),
)

lunar_model_path_run6 = os.path.join(lunar_run_dir_run6, "ppo_lunar_model.pth")
torch.save(lunar_model_run6.state_dict(), lunar_model_path_run6)
print(f"Saved LunarLander PPO run 6 model to {lunar_model_path_run6}")

# Training curve
plot_rewards(
    rewards=lunar_episode_returns_run6,
    run_dir=lunar_run_dir_run6,
    filename="ppo_lunar_train_rewards.png",
    title="LunarLander - PPO training episode returns (run 6)",
    ma_window=20,
)

# Greedy evaluation (10 episodes) with CSV logging
csv_path_run6 = os.path.join(lunar_run_dir_run6, "ppo_lunar_eval_log.csv")

lunar_eval_returns_run6 = evaluate_greedy(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run6,
    control_type="discrete",
    n_episodes=10,
    max_steps=1000,
    base_seed=SEED,
    csv_path=csv_path_run6 # log per-episode seed/return/steps
)

# Save eval returns as .npy
np.save(
    os.path.join(lunar_run_dir_run6, "ppo_lunar_eval_returns.npy"),
    np.array(lunar_eval_returns_run6, dtype=np.float32),
)

# Eval plot
plot_eval_returns(
    returns=lunar_eval_returns_run6,
    run_dir=lunar_run_dir_run6,
    filename="ppo_lunar_eval_rewards.png",
    title="LunarLander - PPO greedy evaluation returns (run 6)",
)

# Record video of the *best* greedy evaluation episode using the CSV
record_best_greedy_from_csv(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run6,
    control_type="discrete",
    run_dir=lunar_run_dir_run6,
    csv_path=csv_path_run6,
    max_steps=1000,
)
LunarLander PPO run 6 dir: a3_bonus_ppo_artifacts/lunar_lander/run_6_lunar_ppo
[PPO] it=    1 steps=    2048 avg10=-196.92 loss=2499.671 pg=-0.004 vf=4999.361 H=1.382 KL=0.0107 clip_frac=0.045
[PPO] it=   11 steps=   22528 avg10=-118.00 loss=255.331 pg=0.001 vf=510.669 H=1.190 KL=0.0136 clip_frac=0.119
[PPO] it=   21 steps=   43008 avg10=-112.54 loss=262.227 pg=-0.005 vf=524.475 H=1.291 KL=0.0134 clip_frac=0.156
[PPO] it=   31 steps=   63488 avg10= -48.74 loss=275.011 pg=0.002 vf=550.029 H=1.197 KL=0.0089 clip_frac=0.082
[PPO] it=   41 steps=   83968 avg10=-118.79 loss=339.200 pg=-0.000 vf=678.409 H=1.098 KL=0.0105 clip_frac=0.077
[PPO] it=   51 steps=  104448 avg10=-105.46 loss=293.111 pg=0.004 vf=586.223 H=1.207 KL=0.0085 clip_frac=0.072
[PPO] it=   61 steps=  124928 avg10=-151.84 loss=699.198 pg=-0.002 vf=1398.407 H=1.094 KL=0.0126 clip_frac=0.091
[PPO] it=   71 steps=  145408 avg10= -29.09 loss=334.385 pg=0.001 vf=668.777 H=1.143 KL=0.0116 clip_frac=0.087
[PPO] it=   81 steps=  165888 avg10= -51.91 loss=281.474 pg=0.000 vf=562.955 H=0.985 KL=0.0108 clip_frac=0.065
[PPO] it=   91 steps=  186368 avg10= -84.25 loss=370.169 pg=0.003 vf=740.341 H=1.032 KL=0.0123 clip_frac=0.086
[PPO] it=  101 steps=  206848 avg10= -63.36 loss=121.355 pg=0.003 vf=242.712 H=1.065 KL=0.0122 clip_frac=0.109
[PPO] it=  111 steps=  227328 avg10=   2.33 loss=40.886 pg=-0.000 vf=81.780 H=1.065 KL=0.0131 clip_frac=0.149
[PPO] it=  121 steps=  247808 avg10=  -5.35 loss=139.201 pg=0.003 vf=278.405 H=1.014 KL=0.0113 clip_frac=0.101
[PPO] it=  131 steps=  268288 avg10=   1.35 loss=259.382 pg=0.001 vf=518.769 H=0.841 KL=0.0103 clip_frac=0.069
[PPO] it=  141 steps=  288768 avg10=   6.76 loss=289.571 pg=0.001 vf=579.147 H=0.790 KL=0.0106 clip_frac=0.082
[PPO] it=  151 steps=  309248 avg10= -39.10 loss=285.819 pg=0.001 vf=571.643 H=0.855 KL=0.0137 clip_frac=0.123
[PPO] it=  161 steps=  329728 avg10= -52.63 loss=244.773 pg=0.000 vf=489.553 H=0.968 KL=0.0125 clip_frac=0.131
[PPO] it=  171 steps=  350208 avg10=  40.82 loss=97.516 pg=0.003 vf=195.034 H=1.018 KL=0.0175 clip_frac=0.192
[PPO] it=  181 steps=  370688 avg10=  33.54 loss=65.310 pg=0.001 vf=130.626 H=1.075 KL=0.0142 clip_frac=0.174
[PPO] it=  191 steps=  391168 avg10=  99.46 loss=12.100 pg=0.011 vf=24.187 H=1.030 KL=0.0241 clip_frac=0.243
[PPO] it=  201 steps=  411648 avg10=  94.15 loss=18.878 pg=0.008 vf=37.749 H=1.070 KL=0.0182 clip_frac=0.190
[PPO] it=  211 steps=  432128 avg10=  70.95 loss=174.764 pg=0.001 vf=349.533 H=0.932 KL=0.0101 clip_frac=0.062
[PPO] it=  220 steps=  450000 avg10=  13.21 loss=18.550 pg=0.002 vf=37.103 H=0.918 KL=0.0182 clip_frac=0.190
[PPO] done steps=450000 time=506.9s avg10=13.21
Saved LunarLander PPO run 6 model to a3_bonus_ppo_artifacts/lunar_lander/run_6_lunar_ppo/ppo_lunar_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/lunar_lander/run_6_lunar_ppo/ppo_lunar_train_rewards.png
Eval episode 1 seed 1227 return -176.35 steps 1000
Eval episode 2 seed 1228 return -137.12 steps 1000
Eval episode 3 seed 1229 return -163.15 steps 1000
Eval episode 4 seed 1230 return -190.27 steps 1000
Eval episode 5 seed 1231 return -170.93 steps 1000
Eval episode 6 seed 1232 return -196.87 steps 1000
Eval episode 7 seed 1233 return -181.78 steps 1000
Eval episode 8 seed 1234 return -139.57 steps 1000
Eval episode 9 seed 1235 return -163.95 steps 1000
Eval episode 10 seed 1236 return -119.46 steps 1000
Greedy evaluation mean -163.94  std 23.65
Saved greedy eval log to a3_bonus_ppo_artifacts/lunar_lander/run_6_lunar_ppo/ppo_lunar_eval_log.csv
No description has been provided for this image
Saved eval plot to a3_bonus_ppo_artifacts/lunar_lander/run_6_lunar_ppo/ppo_lunar_eval_rewards.png
Best eval episode from CSV: ep=10, seed=1236, return=-119.46, steps=1000
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/lunar_lander/run_6_lunar_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return -119.46 steps 1000 with seed 1236 into a3_bonus_ppo_artifacts/lunar_lander/run_6_lunar_ppo/videos
Replayed best episode for video: return=-119.46, steps=1000

Run#7

In [ ]:
# Fresh model for run 7
lunar_model_run7 = build_ppo_discrete_model_from_config(lunar_cfg).to(device)

# PPO hyperparameters for run 7
lunar_ppo_cfg_run7 = PPOUpdateConfig(
    clip_range=0.2,
    value_coef=0.5,
    entropy_coef=0.006,   # a bit less exploration than run 5 (0.008)
    max_grad_norm=0.5,
    n_epochs=6,           # stronger updates than base run (4)
    batch_size=64,
    normalize_adv=True,
)

# Run directory
lunar_run_name_run7 = "run_7_lunar_ppo"
lunar_run_dir_run7 = make_run_dir(LUNAR_ROOT, lunar_run_name_run7)
print(f"LunarLander PPO run 7 dir: {lunar_run_dir_run7}")

# Training budget
lunar_total_steps_run7 = 600_000      # more steps than run 5
lunar_rollout_len_run7 = 2048

# Train PPO
lunar_model_run7, lunar_episode_returns_run7, lunar_logs_run7 = train_ppo_single_env(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run7,
    control_type="discrete",
    run_dir=lunar_run_dir_run7,
    total_env_steps=lunar_total_steps_run7,
    rollout_len=lunar_rollout_len_run7,
    gamma=0.99,
    ppo_cfg=lunar_ppo_cfg_run7,
    lr=2.0e-4,            # slightly smaller LR than run 5 (2.5e-4)
    log_every=20_000,
)

# Save training returns and model
np.save(
    os.path.join(lunar_run_dir_run7, "ppo_lunar_episode_returns.npy"),
    np.array(lunar_episode_returns_run7, dtype=np.float32),
)

lunar_model_path_run7 = os.path.join(lunar_run_dir_run7, "ppo_lunar_model.pth")
torch.save(lunar_model_run7.state_dict(), lunar_model_path_run7)
print(f"Saved LunarLander PPO run 7 model to {lunar_model_path_run7}")

# Training curve
plot_rewards(
    rewards=lunar_episode_returns_run7,
    run_dir=lunar_run_dir_run7,
    filename="ppo_lunar_train_rewards.png",
    title="LunarLander - PPO training episode returns (run 7)",
    ma_window=20,
)

# Greedy evaluation with CSV logging (10 episodes)
csv_path_run7 = os.path.join(lunar_run_dir_run7, "ppo_lunar_eval_log.csv")

lunar_eval_returns_run7 = evaluate_greedy(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run7,
    control_type="discrete",
    n_episodes=10,
    max_steps=1000,
    base_seed=SEED,
    csv_path=csv_path_run7,
)

# Save eval returns and plot
np.save(
    os.path.join(lunar_run_dir_run7, "ppo_lunar_eval_returns.npy"),
    np.array(lunar_eval_returns_run7, dtype=np.float32),
)

plot_eval_returns(
    returns=lunar_eval_returns_run7,
    run_dir=lunar_run_dir_run7,
    filename="ppo_lunar_eval_rewards.png",
    title="LunarLander - PPO greedy evaluation returns (run 7)",
)

# Record video of best greedy episode from CSV
record_best_greedy_from_csv(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run7,
    control_type="discrete",
    run_dir=lunar_run_dir_run7,
    csv_path=csv_path_run7,
    max_steps=1000,
)
LunarLander PPO run 7 dir: a3_bonus_ppo_artifacts/lunar_lander/run_7_lunar_ppo
[PPO] it=    1 steps=    2048 avg10=-148.13 loss=3067.798 pg=-0.001 vf=6135.616 H=1.383 KL=0.0068 clip_frac=0.020
[PPO] it=   11 steps=   22528 avg10=-294.07 loss=1024.483 pg=-0.002 vf=2048.985 H=1.331 KL=0.0134 clip_frac=0.104
[PPO] it=   21 steps=   43008 avg10=-243.23 loss=616.650 pg=-0.001 vf=1233.316 H=1.149 KL=0.0182 clip_frac=0.179
[PPO] it=   31 steps=   63488 avg10= -97.03 loss=222.150 pg=0.001 vf=444.312 H=1.178 KL=0.0171 clip_frac=0.166
[PPO] it=   41 steps=   83968 avg10= -82.66 loss=864.729 pg=0.004 vf=1729.465 H=1.187 KL=0.0146 clip_frac=0.099
[PPO] it=   51 steps=  104448 avg10=-119.70 loss=306.881 pg=-0.003 vf=613.781 H=1.137 KL=0.0172 clip_frac=0.158
[PPO] it=   61 steps=  124928 avg10= -72.78 loss=251.477 pg=0.007 vf=502.954 H=1.164 KL=0.0193 clip_frac=0.229
[PPO] it=   71 steps=  145408 avg10= -41.65 loss=205.642 pg=0.002 vf=411.294 H=1.119 KL=0.0130 clip_frac=0.092
[PPO] it=   81 steps=  165888 avg10=-116.10 loss=192.611 pg=-0.000 vf=385.235 H=1.069 KL=0.0117 clip_frac=0.085
[PPO] it=   91 steps=  186368 avg10= -77.58 loss=69.107 pg=0.002 vf=138.223 H=1.052 KL=0.0100 clip_frac=0.067
[PPO] it=  101 steps=  206848 avg10=-103.15 loss=224.125 pg=-0.000 vf=448.262 H=0.989 KL=0.0113 clip_frac=0.085
[PPO] it=  111 steps=  227328 avg10= -54.56 loss=66.452 pg=0.003 vf=132.909 H=1.021 KL=0.0182 clip_frac=0.148
[PPO] it=  121 steps=  247808 avg10= -28.99 loss=86.032 pg=0.003 vf=172.072 H=1.027 KL=0.0168 clip_frac=0.157
[PPO] it=  131 steps=  268288 avg10=  -5.26 loss=126.373 pg=0.000 vf=252.760 H=1.163 KL=0.0149 clip_frac=0.130
[PPO] it=  141 steps=  288768 avg10=  31.90 loss=42.949 pg=0.000 vf=85.912 H=1.138 KL=0.0187 clip_frac=0.156
[PPO] it=  151 steps=  309248 avg10=  -2.50 loss=78.356 pg=0.001 vf=156.721 H=0.936 KL=0.0176 clip_frac=0.165
[PPO] it=  161 steps=  329728 avg10=  -9.93 loss=106.700 pg=0.002 vf=213.408 H=0.904 KL=0.0098 clip_frac=0.048
[PPO] it=  171 steps=  350208 avg10=  31.39 loss=156.187 pg=-0.001 vf=312.385 H=0.786 KL=0.0114 clip_frac=0.070
[PPO] it=  181 steps=  370688 avg10=  46.35 loss=85.995 pg=-0.001 vf=172.001 H=0.799 KL=0.0108 clip_frac=0.082
[PPO] it=  191 steps=  391168 avg10=   1.38 loss=150.073 pg=-0.002 vf=300.164 H=1.142 KL=0.0124 clip_frac=0.084
[PPO] it=  201 steps=  411648 avg10=  72.53 loss=76.078 pg=0.013 vf=152.141 H=1.075 KL=0.0279 clip_frac=0.254
[PPO] it=  211 steps=  432128 avg10=   9.79 loss=127.183 pg=0.003 vf=254.373 H=1.030 KL=0.0152 clip_frac=0.124
[PPO] it=  221 steps=  452608 avg10=  51.28 loss=76.438 pg=0.004 vf=152.880 H=1.080 KL=0.0133 clip_frac=0.078
[PPO] it=  231 steps=  473088 avg10=  72.53 loss=62.161 pg=0.003 vf=124.329 H=0.999 KL=0.0148 clip_frac=0.110
[PPO] it=  241 steps=  493568 avg10=  57.46 loss=23.316 pg=0.008 vf=46.631 H=1.151 KL=0.0193 clip_frac=0.181
[PPO] it=  251 steps=  514048 avg10=  89.34 loss=39.750 pg=0.007 vf=79.498 H=0.965 KL=0.0185 clip_frac=0.175
[PPO] it=  261 steps=  534528 avg10=  23.43 loss=91.311 pg=-0.005 vf=182.643 H=0.975 KL=0.0124 clip_frac=0.091
[PPO] it=  271 steps=  555008 avg10=  94.28 loss=27.777 pg=0.002 vf=55.563 H=1.086 KL=0.0147 clip_frac=0.122
[PPO] it=  281 steps=  575488 avg10=  99.22 loss=17.382 pg=0.005 vf=34.766 H=0.947 KL=0.0177 clip_frac=0.162
[PPO] it=  291 steps=  595968 avg10= 126.36 loss=96.007 pg=0.002 vf=192.022 H=1.019 KL=0.0227 clip_frac=0.232
[PPO] it=  293 steps=  600000 avg10=  71.30 loss=33.019 pg=0.003 vf=66.045 H=1.021 KL=0.0135 clip_frac=0.105
[PPO] done steps=600000 time=705.4s avg10=71.30
Saved LunarLander PPO run 7 model to a3_bonus_ppo_artifacts/lunar_lander/run_7_lunar_ppo/ppo_lunar_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/lunar_lander/run_7_lunar_ppo/ppo_lunar_train_rewards.png
Eval episode 1 seed 1227 return 125.24 steps 1000
Eval episode 2 seed 1228 return 170.50 steps 1000
Eval episode 3 seed 1229 return 126.57 steps 1000
Eval episode 4 seed 1230 return 114.14 steps 1000
Eval episode 5 seed 1231 return 146.70 steps 1000
Eval episode 6 seed 1232 return 123.47 steps 1000
Eval episode 7 seed 1233 return 130.74 steps 1000
Eval episode 8 seed 1234 return 171.42 steps 1000
Eval episode 9 seed 1235 return 139.95 steps 1000
Eval episode 10 seed 1236 return 157.13 steps 1000
Greedy evaluation mean 140.58  std 19.18
Saved greedy eval log to a3_bonus_ppo_artifacts/lunar_lander/run_7_lunar_ppo/ppo_lunar_eval_log.csv
No description has been provided for this image
Saved eval plot to a3_bonus_ppo_artifacts/lunar_lander/run_7_lunar_ppo/ppo_lunar_eval_rewards.png
Best eval episode from CSV: ep=8, seed=1234, return=171.42, steps=1000
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/lunar_lander/run_7_lunar_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 171.42 steps 1000 with seed 1234 into a3_bonus_ppo_artifacts/lunar_lander/run_7_lunar_ppo/videos
Replayed best episode for video: return=171.42, steps=1000

Run#8

In [ ]:
# fresh model for run 8
lunar_model_run8 = build_ppo_discrete_model_from_config(lunar_cfg).to(device)

lunar_ppo_cfg_run8 = PPOUpdateConfig(
    clip_range=0.18,    # slightly smaller than 0.2 → gentler policy updates
    value_coef=0.5,
    entropy_coef=0.003, # a bit less exploration noise than run 7
    max_grad_norm=0.5,
    n_epochs=6,         # keep reasonably strong updates
    batch_size=64,
    normalize_adv=True,
)

# Run directory
lunar_run_name_run8 = "run_8_lunar_ppo"
lunar_run_dir_run8 = make_run_dir(LUNAR_ROOT, lunar_run_name_run8)
print(f"LunarLander PPO run 8 dir: {lunar_run_dir_run8}")

# Training budget
lunar_total_steps_run8 = 800_000   # more steps than run 7
lunar_rollout_len_run8 = 2048

# Train PPO on LunarLander
lunar_model_run8, lunar_episode_returns_run8, lunar_logs_run8 = train_ppo_single_env(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run8,
    control_type="discrete",
    run_dir=lunar_run_dir_run8,
    total_env_steps=lunar_total_steps_run8,
    rollout_len=lunar_rollout_len_run8,
    gamma=0.99,
    ppo_cfg=lunar_ppo_cfg_run8,
    lr=2.5e-4,          # same ballpark as good runs
    log_every=20_000,
)

# Save training returns and model
np.save(
    os.path.join(lunar_run_dir_run8, "ppo_lunar_episode_returns.npy"),
    np.array(lunar_episode_returns_run8, dtype=np.float32),
)

lunar_model_path_run8 = os.path.join(lunar_run_dir_run8, "ppo_lunar_model.pth")
torch.save(lunar_model_run8.state_dict(), lunar_model_path_run8)
print(f"Saved LunarLander PPO model to {lunar_model_path_run8}")

# Training curve
plot_rewards(
    rewards=lunar_episode_returns_run8,
    run_dir=lunar_run_dir_run8,
    filename="ppo_lunar_train_rewards.png",
    title="LunarLander - PPO training episode returns (run 8)",
    ma_window=20,
)

# Greedy evaluation with seed logging
csv_path_run8 = os.path.join(lunar_run_dir_run8, "ppo_lunar_eval_log.csv")
lunar_eval_returns_run8 = evaluate_greedy(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run8,
    control_type="discrete",
    n_episodes=10,
    max_steps=1000,
    base_seed=SEED,
    csv_path=csv_path_run8,
)

# Save eval returns
np.save(
    os.path.join(lunar_run_dir_run8, "ppo_lunar_eval_returns.npy"),
    np.array(lunar_eval_returns_run8, dtype=np.float32),
)

# Eval plot
plot_eval_returns(
    returns=lunar_eval_returns_run8,
    run_dir=lunar_run_dir_run8,
    filename="ppo_lunar_eval_rewards.png",
    title="LunarLander - PPO greedy evaluation returns (run 8)",
)

# Record video of the best eval episode
record_best_greedy_from_csv(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run8,
    control_type="discrete",
    run_dir=lunar_run_dir_run8,
    csv_path=csv_path_run8,
    max_steps=1000,
)
LunarLander PPO run 8 dir: a3_bonus_ppo_artifacts/lunar_lander/run_8_lunar_ppo
[PPO] it=    1 steps=    2048 avg10=-157.84 loss=2026.590 pg=-0.002 vf=4053.192 H=1.381 KL=0.0109 clip_frac=0.087
[PPO] it=   11 steps=   22528 avg10=-125.26 loss=347.334 pg=-0.001 vf=694.679 H=1.273 KL=0.0124 clip_frac=0.096
[PPO] it=   21 steps=   43008 avg10=-105.24 loss=165.094 pg=0.003 vf=330.189 H=1.207 KL=0.0204 clip_frac=0.247
[PPO] it=   31 steps=   63488 avg10= -82.28 loss=58.596 pg=0.004 vf=117.191 H=0.968 KL=0.0195 clip_frac=0.192
[PPO] it=   41 steps=   83968 avg10=-166.35 loss=1353.590 pg=0.005 vf=2707.176 H=0.897 KL=0.0148 clip_frac=0.094
[PPO] it=   51 steps=  104448 avg10=-441.34 loss=602.526 pg=0.009 vf=1205.038 H=0.683 KL=0.0221 clip_frac=0.168
[PPO] it=   61 steps=  124928 avg10=-262.53 loss=379.510 pg=0.016 vf=758.993 H=0.819 KL=0.0456 clip_frac=0.236
[PPO] it=   71 steps=  145408 avg10=-283.31 loss=118.736 pg=0.012 vf=237.451 H=0.517 KL=0.0355 clip_frac=0.213
[PPO] it=   81 steps=  165888 avg10=-170.68 loss=210.621 pg=0.000 vf=421.244 H=0.498 KL=0.0189 clip_frac=0.170
[PPO] it=   91 steps=  186368 avg10=-226.98 loss=199.528 pg=0.006 vf=399.048 H=0.695 KL=0.0184 clip_frac=0.165
[PPO] it=  101 steps=  206848 avg10=-233.73 loss=631.259 pg=0.004 vf=1262.512 H=0.516 KL=0.0119 clip_frac=0.115
[PPO] it=  111 steps=  227328 avg10=-180.88 loss=308.309 pg=0.007 vf=616.607 H=0.566 KL=0.0145 clip_frac=0.180
[PPO] it=  121 steps=  247808 avg10=-314.19 loss=180.941 pg=0.006 vf=361.873 H=0.532 KL=0.0232 clip_frac=0.215
[PPO] it=  131 steps=  268288 avg10=-260.37 loss=284.628 pg=0.005 vf=569.249 H=0.446 KL=0.0195 clip_frac=0.177
[PPO] it=  141 steps=  288768 avg10=-232.23 loss=301.353 pg=0.005 vf=602.698 H=0.481 KL=0.0165 clip_frac=0.161
[PPO] it=  151 steps=  309248 avg10=-228.60 loss=213.435 pg=0.005 vf=426.862 H=0.472 KL=0.0149 clip_frac=0.160
[PPO] it=  161 steps=  329728 avg10=-260.93 loss=524.869 pg=0.002 vf=1049.737 H=0.419 KL=0.0101 clip_frac=0.080
[PPO] it=  171 steps=  350208 avg10=-176.96 loss=260.004 pg=0.003 vf=520.005 H=0.505 KL=0.0119 clip_frac=0.119
[PPO] it=  181 steps=  370688 avg10=-283.81 loss=832.659 pg=0.004 vf=1665.312 H=0.416 KL=0.0128 clip_frac=0.092
[PPO] it=  191 steps=  391168 avg10=-157.52 loss=877.589 pg=0.005 vf=1755.170 H=0.331 KL=0.0243 clip_frac=0.129
[PPO] it=  201 steps=  411648 avg10= -95.38 loss=288.671 pg=0.002 vf=577.340 H=0.264 KL=0.0186 clip_frac=0.124
[PPO] it=  211 steps=  432128 avg10=-156.55 loss=1122.172 pg=0.029 vf=2244.288 H=0.445 KL=0.1960 clip_frac=0.244
[PPO] it=  221 steps=  452608 avg10=-143.74 loss=688.388 pg=0.000 vf=1376.778 H=0.365 KL=0.0102 clip_frac=0.078
[PPO] it=  231 steps=  473088 avg10=-126.32 loss=382.098 pg=0.010 vf=764.178 H=0.312 KL=0.0206 clip_frac=0.149
[PPO] it=  241 steps=  493568 avg10=-358.92 loss=1550.937 pg=0.016 vf=3101.845 H=0.338 KL=0.0327 clip_frac=0.218
[PPO] it=  251 steps=  514048 avg10= -88.93 loss=504.996 pg=0.008 vf=1009.976 H=0.272 KL=0.0183 clip_frac=0.117
[PPO] it=  261 steps=  534528 avg10=-205.15 loss=859.445 pg=0.009 vf=1718.874 H=0.265 KL=0.0348 clip_frac=0.167
[PPO] it=  271 steps=  555008 avg10=-320.29 loss=188.879 pg=0.023 vf=377.713 H=0.268 KL=0.0560 clip_frac=0.205
[PPO] it=  281 steps=  575488 avg10=-391.14 loss=169.752 pg=0.005 vf=339.496 H=0.347 KL=0.0177 clip_frac=0.135
[PPO] it=  291 steps=  595968 avg10=-360.37 loss=228.697 pg=0.005 vf=457.387 H=0.301 KL=0.0154 clip_frac=0.113
[PPO] it=  301 steps=  616448 avg10=-356.75 loss=402.375 pg=0.009 vf=804.734 H=0.407 KL=0.0157 clip_frac=0.170
[PPO] it=  311 steps=  636928 avg10=-235.67 loss=157.948 pg=0.034 vf=315.830 H=0.238 KL=0.1203 clip_frac=0.202
[PPO] it=  321 steps=  657408 avg10=-213.00 loss=221.395 pg=0.006 vf=442.780 H=0.232 KL=0.0136 clip_frac=0.083
[PPO] it=  331 steps=  677888 avg10=-153.35 loss=324.349 pg=0.007 vf=648.687 H=0.282 KL=0.0160 clip_frac=0.131
[PPO] it=  341 steps=  698368 avg10=-161.05 loss=857.150 pg=0.002 vf=1714.298 H=0.183 KL=0.0107 clip_frac=0.052
[PPO] it=  351 steps=  718848 avg10=-236.93 loss=243.908 pg=0.010 vf=487.797 H=0.191 KL=0.0213 clip_frac=0.090
[PPO] it=  361 steps=  739328 avg10=-298.88 loss=167.347 pg=0.010 vf=334.675 H=0.279 KL=0.0177 clip_frac=0.147
[PPO] it=  371 steps=  759808 avg10=-356.14 loss=317.747 pg=0.003 vf=635.489 H=0.233 KL=0.0119 clip_frac=0.073
[PPO] it=  381 steps=  780288 avg10=-263.02 loss=320.538 pg=0.005 vf=641.068 H=0.201 KL=0.0124 clip_frac=0.080
[PPO] it=  391 steps=  800000 avg10=-289.29 loss=164.598 pg=0.008 vf=329.181 H=0.214 KL=0.0194 clip_frac=0.108
[PPO] done steps=800000 time=939.0s avg10=-289.29
Saved LunarLander PPO model to a3_bonus_ppo_artifacts/lunar_lander/run_8_lunar_ppo/ppo_lunar_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/lunar_lander/run_8_lunar_ppo/ppo_lunar_train_rewards.png
Eval episode 1 seed 1227 return -282.75 steps 114
Eval episode 2 seed 1228 return -324.28 steps 79
Eval episode 3 seed 1229 return -335.97 steps 106
Eval episode 4 seed 1230 return -612.31 steps 411
Eval episode 5 seed 1231 return -324.25 steps 98
Eval episode 6 seed 1232 return -273.59 steps 177
Eval episode 7 seed 1233 return -265.68 steps 126
Eval episode 8 seed 1234 return -295.61 steps 78
Eval episode 9 seed 1235 return -370.81 steps 115
Eval episode 10 seed 1236 return -278.17 steps 106
Greedy evaluation mean -336.34  std 97.20
Saved greedy eval log to a3_bonus_ppo_artifacts/lunar_lander/run_8_lunar_ppo/ppo_lunar_eval_log.csv
No description has been provided for this image
Saved eval plot to a3_bonus_ppo_artifacts/lunar_lander/run_8_lunar_ppo/ppo_lunar_eval_rewards.png
Best eval episode from CSV: ep=7, seed=1233, return=-265.68, steps=126
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/lunar_lander/run_8_lunar_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return -265.68 steps 126 with seed 1233 into a3_bonus_ppo_artifacts/lunar_lander/run_8_lunar_ppo/videos
Replayed best episode for video: return=-265.68, steps=126

Run#9

In [ ]:
# fresh network for run 9
lunar_model_run9 = build_ppo_discrete_model_from_config(lunar_cfg).to(device)

lunar_ppo_cfg_run9 = PPOUpdateConfig(
    clip_range=0.18,     # a bit smaller than 0.2 → gentler policy updates
    value_coef=0.5,
    entropy_coef=0.004,  # slightly less exploration than run 7
    max_grad_norm=0.5,
    n_epochs=5,          # between earlier 4 and 6 → solid but not too aggressive
    batch_size=128,      # larger batch → smoother gradient estimates
    normalize_adv=True,
)

# Run directory for Run 9
lunar_run_name_run9 = "run_9_lunar_ppo"
lunar_run_dir_run9 = make_run_dir(LUNAR_ROOT, lunar_run_name_run9)
print(f"LunarLander PPO run 9 dir: {lunar_run_dir_run9}")

# Training budget (a bit longer, with longer rollouts)
lunar_total_steps_run9 = 450_000
lunar_rollout_len_run9 = 3072

# Train PPO on LunarLander (Run 9)
lunar_model_run9, lunar_episode_returns_run9, lunar_logs_run9 = train_ppo_single_env(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run9,
    control_type="discrete",
    run_dir=lunar_run_dir_run9,
    total_env_steps=lunar_total_steps_run9,
    rollout_len=lunar_rollout_len_run9,
    gamma=0.99,
    ppo_cfg=lunar_ppo_cfg_run9,
    lr=2.0e-4,           # a bit smaller LR than run 7
    log_every=20_000,
)

# Save training returns and model
np.save(
    os.path.join(lunar_run_dir_run9, "ppo_lunar_episode_returns.npy"),
    np.array(lunar_episode_returns_run9, dtype=np.float32),
)

lunar_model_path_run9 = os.path.join(lunar_run_dir_run9, "ppo_lunar_model.pth")
torch.save(lunar_model_run9.state_dict(), lunar_model_path_run9)
print(f"Saved LunarLander PPO model to {lunar_model_path_run9}")

# Training curve
plot_rewards(
    rewards=lunar_episode_returns_run9,
    run_dir=lunar_run_dir_run9,
    filename="ppo_lunar_train_rewards.png",
    title="LunarLander - PPO training episode returns (run 9)",
    ma_window=20,
)

# Greedy evaluation with per-episode CSV logging
csv_path_run9 = os.path.join(lunar_run_dir_run9, "ppo_lunar_eval_log.csv")
lunar_eval_returns_run9 = evaluate_greedy(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run9,
    control_type="discrete",
    n_episodes=10,
    max_steps=1000,
    base_seed=SEED,
    csv_path=csv_path_run9,  # log ep, seed, return, steps
)

# Save eval returns as .npy
np.save(
    os.path.join(lunar_run_dir_run9, "ppo_lunar_eval_returns.npy"),
    np.array(lunar_eval_returns_run9, dtype=np.float32),
)

# Eval plot
plot_eval_returns(
    returns=lunar_eval_returns_run9,
    run_dir=lunar_run_dir_run9,
    filename="ppo_lunar_eval_rewards.png",
    title="LunarLander - PPO greedy evaluation returns (run 9)",
)

# Record video of the best greedy evaluation episode
record_best_greedy_from_csv(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run9,
    control_type="discrete",
    run_dir=lunar_run_dir_run9,
    csv_path=csv_path_run9,
    max_steps=1000,
)
LunarLander PPO run 9 dir: a3_bonus_ppo_artifacts/lunar_lander/run_9_lunar_ppo
[PPO] it=    1 steps=    3072 avg10=-189.53 loss=3675.359 pg=-0.004 vf=7350.736 H=1.383 KL=0.0062 clip_frac=0.035
[PPO] it=    8 steps=   24576 avg10=-128.55 loss=546.037 pg=-0.003 vf=1092.091 H=1.373 KL=0.0110 clip_frac=0.122
[PPO] it=   15 steps=   46080 avg10=-142.60 loss=176.401 pg=-0.000 vf=352.812 H=1.305 KL=0.0100 clip_frac=0.101
[PPO] it=   22 steps=   67584 avg10= -87.89 loss=232.211 pg=-0.001 vf=464.436 H=1.329 KL=0.0108 clip_frac=0.096
[PPO] it=   29 steps=   89088 avg10= -96.46 loss=443.008 pg=0.002 vf=886.023 H=1.251 KL=0.0089 clip_frac=0.064
[PPO] it=   36 steps=  110592 avg10= -86.08 loss=307.777 pg=0.001 vf=615.562 H=1.197 KL=0.0092 clip_frac=0.107
[PPO] it=   43 steps=  132096 avg10= -84.65 loss=209.608 pg=0.001 vf=419.223 H=1.206 KL=0.0124 clip_frac=0.118
[PPO] it=   50 steps=  153600 avg10=-132.22 loss=273.985 pg=0.000 vf=547.979 H=1.184 KL=0.0074 clip_frac=0.051
[PPO] it=   57 steps=  175104 avg10=-160.37 loss=826.809 pg=0.001 vf=1653.626 H=1.097 KL=0.0107 clip_frac=0.087
[PPO] it=   64 steps=  196608 avg10=-179.60 loss=140.510 pg=-0.000 vf=281.031 H=1.149 KL=0.0069 clip_frac=0.062
[PPO] it=   71 steps=  218112 avg10= -45.59 loss=226.316 pg=0.000 vf=452.639 H=1.135 KL=0.0084 clip_frac=0.064
[PPO] it=   78 steps=  239616 avg10= -33.20 loss=103.112 pg=0.003 vf=206.227 H=1.174 KL=0.0073 clip_frac=0.077
[PPO] it=   85 steps=  261120 avg10= -13.66 loss=117.498 pg=-0.002 vf=235.009 H=1.083 KL=0.0078 clip_frac=0.069
[PPO] it=   92 steps=  282624 avg10=  -4.03 loss=120.181 pg=0.002 vf=240.367 H=1.186 KL=0.0084 clip_frac=0.078
[PPO] it=   99 steps=  304128 avg10=  -8.13 loss=49.127 pg=-0.005 vf=98.273 H=1.191 KL=0.0098 clip_frac=0.100
[PPO] it=  106 steps=  325632 avg10=  11.27 loss=62.431 pg=-0.000 vf=124.872 H=1.109 KL=0.0092 clip_frac=0.109
[PPO] it=  113 steps=  347136 avg10=   0.86 loss=68.482 pg=-0.002 vf=136.977 H=1.163 KL=0.0088 clip_frac=0.091
[PPO] it=  120 steps=  368640 avg10=  -4.54 loss=46.163 pg=-0.000 vf=92.336 H=1.127 KL=0.0091 clip_frac=0.106
[PPO] it=  127 steps=  390144 avg10= -14.10 loss=56.053 pg=-0.001 vf=112.116 H=1.109 KL=0.0079 clip_frac=0.072
[PPO] it=  134 steps=  411648 avg10= -15.80 loss=120.497 pg=-0.002 vf=241.007 H=1.123 KL=0.0087 clip_frac=0.063
[PPO] it=  141 steps=  433152 avg10=  -0.60 loss=56.986 pg=0.002 vf=113.977 H=1.153 KL=0.0061 clip_frac=0.036
[PPO] it=  147 steps=  450000 avg10=  -7.37 loss=61.256 pg=-0.002 vf=122.524 H=1.099 KL=0.0082 clip_frac=0.071
[PPO] done steps=450000 time=435.3s avg10=-7.37
Saved LunarLander PPO model to a3_bonus_ppo_artifacts/lunar_lander/run_9_lunar_ppo/ppo_lunar_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/lunar_lander/run_9_lunar_ppo/ppo_lunar_train_rewards.png
Eval episode 1 seed 1227 return -41.11 steps 1000
Eval episode 2 seed 1228 return 17.92 steps 1000
Eval episode 3 seed 1229 return -39.86 steps 1000
Eval episode 4 seed 1230 return -43.46 steps 1000
Eval episode 5 seed 1231 return -18.10 steps 1000
Eval episode 6 seed 1232 return -33.07 steps 1000
Eval episode 7 seed 1233 return -25.81 steps 1000
Eval episode 8 seed 1234 return 13.78 steps 1000
Eval episode 9 seed 1235 return -18.29 steps 1000
Eval episode 10 seed 1236 return -8.25 steps 1000
Greedy evaluation mean -19.63  std 20.78
Saved greedy eval log to a3_bonus_ppo_artifacts/lunar_lander/run_9_lunar_ppo/ppo_lunar_eval_log.csv
No description has been provided for this image
Saved eval plot to a3_bonus_ppo_artifacts/lunar_lander/run_9_lunar_ppo/ppo_lunar_eval_rewards.png
Best eval episode from CSV: ep=2, seed=1228, return=17.92, steps=1000
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/lunar_lander/run_9_lunar_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 17.92 steps 1000 with seed 1228 into a3_bonus_ppo_artifacts/lunar_lander/run_9_lunar_ppo/videos
Replayed best episode for video: return=17.92, steps=1000

Run#10

In [ ]:
# fresh model for run 10
lunar_model_run10 = build_ppo_discrete_model_from_config(lunar_cfg).to(device)

lunar_ppo_cfg_run10 = PPOUpdateConfig(
    clip_range=0.20,
    value_coef=0.5,
    entropy_coef=0.006,   # a bit less noisy than 0.008 from run 9
    max_grad_norm=0.5,
    n_epochs=5,           # slightly gentler than 6
    batch_size=64,
    normalize_adv=True,
)

lunar_run_name_run10 = "run_10_lunar_ppo"
lunar_run_dir_run10 = make_run_dir(LUNAR_ROOT, lunar_run_name_run10)
print(f"LunarLander PPO run 10 dir: {lunar_run_dir_run10}")

# training budget
lunar_total_steps_run10 = 600_000
lunar_rollout_len_run10 = 2048

# Train PPO
lunar_model_run10, lunar_episode_returns_run10, lunar_logs_run10 = train_ppo_single_env(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run10,
    control_type="discrete",
    run_dir=lunar_run_dir_run10,
    total_env_steps=lunar_total_steps_run10,
    rollout_len=lunar_rollout_len_run10,
    gamma=0.99,
    ppo_cfg=lunar_ppo_cfg_run10,
    lr=2.5e-4,
    log_every=20_000,
)

# save training returns and model
np.save(
    os.path.join(lunar_run_dir_run10, "ppo_lunar_episode_returns.npy"),
    np.array(lunar_episode_returns_run10, dtype=np.float32),
)

lunar_model_path_run10 = os.path.join(lunar_run_dir_run10, "ppo_lunar_model.pth")
torch.save(lunar_model_run10.state_dict(), lunar_model_path_run10)
print(f"Saved LunarLander PPO run 10 model to {lunar_model_path_run10}")

# training curve
plot_rewards(
    rewards=lunar_episode_returns_run10,
    run_dir=lunar_run_dir_run10,
    filename="ppo_lunar_train_rewards.png",
    title="LunarLander - PPO training episode returns (run 10)",
    ma_window=20,
)

# Greedy evaluation with CSV logging
csv_path_run10 = os.path.join(lunar_run_dir_run10, "ppo_lunar_eval_log.csv")

lunar_eval_returns_run10 = evaluate_greedy(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run10,
    control_type="discrete",
    n_episodes=10,
    max_steps=1000,
    base_seed=SEED,
    csv_path=csv_path_run10,
)

# save eval returns
np.save(
    os.path.join(lunar_run_dir_run10, "ppo_lunar_eval_returns.npy"),
    np.array(lunar_eval_returns_run10, dtype=np.float32),
)

# eval plot
plot_eval_returns(
    returns=lunar_eval_returns_run10,
    run_dir=lunar_run_dir_run10,
    filename="ppo_lunar_eval_rewards.png",
    title="LunarLander - PPO greedy evaluation returns (run 10)",
)

# Record best greedy episode as video
record_best_greedy_from_csv(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run10,
    control_type="discrete",
    run_dir=lunar_run_dir_run10,
    csv_path=csv_path_run10,
    max_steps=1000,
)
LunarLander PPO run 10 dir: a3_bonus_ppo_artifacts/lunar_lander/run_10_lunar_ppo
[PPO] it=    1 steps=    2048 avg10=-153.62 loss=1646.867 pg=-0.000 vf=3293.751 H=1.383 KL=0.0072 clip_frac=0.033
[PPO] it=   11 steps=   22528 avg10=-237.16 loss=982.116 pg=-0.008 vf=1964.262 H=1.205 KL=0.0216 clip_frac=0.209
[PPO] it=   21 steps=   43008 avg10=-114.65 loss=204.577 pg=-0.005 vf=409.179 H=1.221 KL=0.0202 clip_frac=0.180
[PPO] it=   31 steps=   63488 avg10= -92.67 loss=140.522 pg=0.003 vf=281.053 H=1.136 KL=0.0135 clip_frac=0.078
[PPO] it=   41 steps=   83968 avg10= -68.67 loss=163.571 pg=0.002 vf=327.151 H=1.024 KL=0.0120 clip_frac=0.072
[PPO] it=   51 steps=  104448 avg10=  -5.01 loss=56.131 pg=-0.000 vf=112.273 H=1.027 KL=0.0148 clip_frac=0.117
[PPO] it=   61 steps=  124928 avg10=  12.68 loss=38.681 pg=0.002 vf=77.371 H=1.078 KL=0.0134 clip_frac=0.106
[PPO] it=   71 steps=  145408 avg10=   8.84 loss=90.441 pg=-0.000 vf=180.895 H=1.047 KL=0.0133 clip_frac=0.084
[PPO] it=   81 steps=  165888 avg10=   2.70 loss=34.284 pg=0.004 vf=68.573 H=1.151 KL=0.0153 clip_frac=0.098
[PPO] it=   91 steps=  186368 avg10=  23.44 loss=47.162 pg=0.001 vf=94.333 H=1.036 KL=0.0138 clip_frac=0.097
[PPO] it=  101 steps=  206848 avg10=   4.20 loss=180.128 pg=0.000 vf=360.269 H=1.035 KL=0.0131 clip_frac=0.075
[PPO] it=  111 steps=  227328 avg10= -17.72 loss=90.744 pg=0.000 vf=181.501 H=1.080 KL=0.0174 clip_frac=0.130
[PPO] it=  121 steps=  247808 avg10=  37.27 loss=22.975 pg=0.002 vf=45.960 H=1.141 KL=0.0169 clip_frac=0.134
[PPO] it=  131 steps=  268288 avg10=  19.39 loss=89.272 pg=-0.002 vf=178.562 H=1.114 KL=0.0188 clip_frac=0.136
[PPO] it=  141 steps=  288768 avg10=  12.37 loss=48.259 pg=0.001 vf=96.527 H=0.862 KL=0.0165 clip_frac=0.130
[PPO] it=  151 steps=  309248 avg10= -41.03 loss=52.199 pg=0.007 vf=104.396 H=0.937 KL=0.0198 clip_frac=0.147
[PPO] it=  161 steps=  329728 avg10= -12.58 loss=208.195 pg=0.004 vf=416.392 H=0.942 KL=0.0165 clip_frac=0.110
[PPO] it=  171 steps=  350208 avg10= -13.64 loss=298.074 pg=0.006 vf=596.149 H=0.950 KL=0.0139 clip_frac=0.108
[PPO] it=  181 steps=  370688 avg10=  82.25 loss=147.107 pg=0.006 vf=294.210 H=0.693 KL=0.0367 clip_frac=0.189
[PPO] it=  191 steps=  391168 avg10= -11.64 loss=158.656 pg=0.004 vf=317.312 H=0.651 KL=0.0133 clip_frac=0.079
[PPO] it=  201 steps=  411648 avg10= -58.11 loss=280.605 pg=0.008 vf=561.204 H=0.905 KL=0.0209 clip_frac=0.222
[PPO] it=  211 steps=  432128 avg10=  75.71 loss=212.656 pg=0.000 vf=425.321 H=0.805 KL=0.0154 clip_frac=0.115
[PPO] it=  221 steps=  452608 avg10=-121.38 loss=79.551 pg=0.008 vf=159.098 H=0.967 KL=0.0257 clip_frac=0.227
[PPO] it=  231 steps=  473088 avg10= -87.51 loss=277.417 pg=0.005 vf=554.836 H=0.906 KL=0.0235 clip_frac=0.179
[PPO] it=  241 steps=  493568 avg10=   9.08 loss=421.139 pg=0.001 vf=842.285 H=0.760 KL=0.0129 clip_frac=0.083
[PPO] it=  251 steps=  514048 avg10= -11.84 loss=702.870 pg=0.002 vf=1405.747 H=0.803 KL=0.0122 clip_frac=0.082
[PPO] it=  261 steps=  534528 avg10=  75.23 loss=51.203 pg=-0.004 vf=102.424 H=0.831 KL=0.0141 clip_frac=0.102
[PPO] it=  271 steps=  555008 avg10=   9.60 loss=108.662 pg=0.001 vf=217.331 H=0.749 KL=0.0111 clip_frac=0.067
[PPO] it=  281 steps=  575488 avg10=  61.76 loss=108.390 pg=0.001 vf=216.786 H=0.755 KL=0.0137 clip_frac=0.071
[PPO] it=  291 steps=  595968 avg10=  57.73 loss=51.128 pg=0.008 vf=102.249 H=0.819 KL=0.0172 clip_frac=0.125
[PPO] it=  293 steps=  600000 avg10=  38.31 loss=106.424 pg=-0.001 vf=212.858 H=0.670 KL=0.0104 clip_frac=0.052
[PPO] done steps=600000 time=694.2s avg10=38.31
Saved LunarLander PPO run 10 model to a3_bonus_ppo_artifacts/lunar_lander/run_10_lunar_ppo/ppo_lunar_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/lunar_lander/run_10_lunar_ppo/ppo_lunar_train_rewards.png
Eval episode 1 seed 1227 return 189.28 steps 540
Eval episode 2 seed 1228 return 205.80 steps 797
Eval episode 3 seed 1229 return 179.32 steps 704
Eval episode 4 seed 1230 return 154.66 steps 399
Eval episode 5 seed 1231 return -9.12 steps 248
Eval episode 6 seed 1232 return 172.72 steps 788
Eval episode 7 seed 1233 return -110.31 steps 207
Eval episode 8 seed 1234 return 231.90 steps 679
Eval episode 9 seed 1235 return 190.20 steps 711
Eval episode 10 seed 1236 return 222.43 steps 526
Greedy evaluation mean 142.69  std 105.91
Saved greedy eval log to a3_bonus_ppo_artifacts/lunar_lander/run_10_lunar_ppo/ppo_lunar_eval_log.csv
No description has been provided for this image
Saved eval plot to a3_bonus_ppo_artifacts/lunar_lander/run_10_lunar_ppo/ppo_lunar_eval_rewards.png
Best eval episode from CSV: ep=8, seed=1234, return=231.90, steps=679
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/lunar_lander/run_10_lunar_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 231.90 steps 679 with seed 1234 into a3_bonus_ppo_artifacts/lunar_lander/run_10_lunar_ppo/videos
Replayed best episode for video: return=231.90, steps=679

Run#11

In [ ]:
# fresh PPO model for run 11
lunar_model_run11 = build_ppo_discrete_model_from_config(lunar_cfg).to(device)

# slightly more conservative but still exploratory config
lunar_ppo_cfg_run11 = PPOUpdateConfig(
    clip_range=0.18,     # a bit smaller than 0.2 → stabler updates
    value_coef=0.5,
    entropy_coef=0.007,  # between 0.008 and 0.006 → some exploration
    max_grad_norm=0.5,
    n_epochs=4,          # less aggressive than 6 → less overfitting per batch
    batch_size=64,
    normalize_adv=True,
)

# run-11 directory
lunar_run_name_run11 = "run_11_lunar_ppo"
lunar_run_dir_run11 = make_run_dir(LUNAR_ROOT, lunar_run_name_run11)
print(f"LunarLander PPO run 11 dir: {lunar_run_dir_run11}")

# a bit more training than run 9, but not huge
lunar_total_steps_run11 = 600_000
lunar_rollout_len_run11 = 2048

# Train PPO on LunarLander (run 11)
lunar_model_run11, lunar_episode_returns_run11, lunar_logs_run11 = train_ppo_single_env(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run11,
    control_type="discrete",
    run_dir=lunar_run_dir_run11,
    total_env_steps=lunar_total_steps_run11,
    rollout_len=lunar_rollout_len_run11,
    gamma=0.99,
    ppo_cfg=lunar_ppo_cfg_run11,
    lr=2.0e-4,        # a bit smaller LR than 2.5e-4 for smoother learning
    log_every=20_000,
)

# save training returns + model
np.save(
    os.path.join(lunar_run_dir_run11, "ppo_lunar_episode_returns.npy"),
    np.array(lunar_episode_returns_run11, dtype=np.float32),
)

lunar_model_path_run11 = os.path.join(lunar_run_dir_run11, "ppo_lunar_model.pth")
torch.save(lunar_model_run11.state_dict(), lunar_model_path_run11)
print(f"Saved LunarLander PPO run 11 model to {lunar_model_path_run11}")

# training curve
plot_rewards(
    rewards=lunar_episode_returns_run11,
    run_dir=lunar_run_dir_run11,
    filename="ppo_lunar_train_rewards.png",
    title="LunarLander - PPO training episode returns (run 11)",
    ma_window=20,
)

# Greedy evaluation with CSV logging
csv_path_run11 = os.path.join(lunar_run_dir_run11, "ppo_lunar_eval_log.csv")

lunar_eval_returns_run11 = evaluate_greedy(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run11,
    control_type="discrete",
    n_episodes=10,
    max_steps=1000,
    base_seed=SEED,           # seeds 1227.. etc, reproducible
    csv_path=csv_path_run11,  # log ep, seed, return, steps
)

# save eval returns as .npy
np.save(
    os.path.join(lunar_run_dir_run11, "ppo_lunar_eval_returns.npy"),
    np.array(lunar_eval_returns_run11, dtype=np.float32),
)

# eval plot
plot_eval_returns(
    returns=lunar_eval_returns_run11,
    run_dir=lunar_run_dir_run11,
    filename="ppo_lunar_eval_rewards.png",
    title="LunarLander - PPO greedy evaluation returns (run 11)",
)

# Record video of best eval episode (by return)
record_best_greedy_from_csv(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run11,
    control_type="discrete",
    run_dir=lunar_run_dir_run11,
    csv_path=csv_path_run11,
    max_steps=1000,
)
LunarLander PPO run 11 dir: a3_bonus_ppo_artifacts/lunar_lander/run_11_lunar_ppo
[PPO] it=    1 steps=    2048 avg10=-113.89 loss=2794.694 pg=-0.002 vf=5589.411 H=1.385 KL=0.0048 clip_frac=0.019
[PPO] it=   11 steps=   22528 avg10=-144.86 loss=472.695 pg=0.005 vf=945.400 H=1.336 KL=0.0166 clip_frac=0.171
[PPO] it=   21 steps=   43008 avg10=-141.65 loss=259.241 pg=0.002 vf=518.495 H=1.328 KL=0.0130 clip_frac=0.161
[PPO] it=   31 steps=   63488 avg10=-152.01 loss=445.492 pg=0.002 vf=890.999 H=1.353 KL=0.0120 clip_frac=0.063
[PPO] it=   41 steps=   83968 avg10=-117.92 loss=445.997 pg=-0.000 vf=892.013 H=1.336 KL=0.0108 clip_frac=0.099
[PPO] it=   51 steps=  104448 avg10=-114.72 loss=106.575 pg=0.002 vf=213.165 H=1.340 KL=0.0157 clip_frac=0.110
[PPO] it=   61 steps=  124928 avg10= -77.17 loss=38.700 pg=-0.001 vf=77.421 H=1.273 KL=0.0180 clip_frac=0.225
[PPO] it=   71 steps=  145408 avg10= -86.91 loss=115.129 pg=-0.003 vf=230.282 H=1.284 KL=0.0154 clip_frac=0.112
[PPO] it=   81 steps=  165888 avg10= -60.81 loss=186.517 pg=-0.000 vf=373.052 H=1.226 KL=0.0087 clip_frac=0.071
[PPO] it=   91 steps=  186368 avg10= -94.60 loss=255.463 pg=-0.001 vf=510.945 H=1.155 KL=0.0129 clip_frac=0.098
[PPO] it=  101 steps=  206848 avg10= -85.74 loss=334.823 pg=-0.001 vf=669.663 H=1.182 KL=0.0166 clip_frac=0.165
[PPO] it=  111 steps=  227328 avg10= -29.11 loss=172.544 pg=-0.001 vf=345.106 H=1.136 KL=0.0099 clip_frac=0.070
[PPO] it=  121 steps=  247808 avg10= -91.01 loss=579.966 pg=0.002 vf=1159.943 H=1.099 KL=0.0100 clip_frac=0.093
[PPO] it=  131 steps=  268288 avg10= -53.65 loss=148.604 pg=0.002 vf=297.219 H=1.065 KL=0.0150 clip_frac=0.151
[PPO] it=  141 steps=  288768 avg10= -63.37 loss=202.042 pg=0.003 vf=404.094 H=1.079 KL=0.0145 clip_frac=0.150
[PPO] it=  151 steps=  309248 avg10= -33.96 loss=96.118 pg=0.006 vf=192.239 H=0.964 KL=0.0199 clip_frac=0.199
[PPO] it=  161 steps=  329728 avg10= -35.65 loss=59.493 pg=0.000 vf=119.001 H=1.099 KL=0.0109 clip_frac=0.077
[PPO] it=  171 steps=  350208 avg10=   2.40 loss=36.437 pg=0.002 vf=72.886 H=1.134 KL=0.0139 clip_frac=0.126
[PPO] it=  181 steps=  370688 avg10=  -9.37 loss=33.417 pg=-0.001 vf=66.851 H=1.095 KL=0.0144 clip_frac=0.144
[PPO] it=  191 steps=  391168 avg10= -23.86 loss=87.293 pg=0.003 vf=174.595 H=1.147 KL=0.0181 clip_frac=0.165
[PPO] it=  201 steps=  411648 avg10=  -1.15 loss=38.011 pg=0.005 vf=76.029 H=1.162 KL=0.0149 clip_frac=0.120
[PPO] it=  211 steps=  432128 avg10= -48.38 loss=116.198 pg=0.004 vf=232.405 H=1.173 KL=0.0101 clip_frac=0.064
[PPO] it=  221 steps=  452608 avg10= -98.90 loss=872.847 pg=0.006 vf=1745.695 H=0.916 KL=0.0133 clip_frac=0.098
[PPO] it=  231 steps=  473088 avg10= -92.65 loss=256.522 pg=0.002 vf=513.052 H=0.886 KL=0.0112 clip_frac=0.085
[PPO] it=  241 steps=  493568 avg10=-126.81 loss=386.429 pg=0.007 vf=772.856 H=0.907 KL=0.0200 clip_frac=0.180
[PPO] it=  251 steps=  514048 avg10= -19.18 loss=117.192 pg=0.001 vf=234.396 H=0.966 KL=0.0116 clip_frac=0.080
[PPO] it=  261 steps=  534528 avg10= -14.61 loss=173.254 pg=-0.004 vf=346.528 H=0.919 KL=0.0129 clip_frac=0.111
[PPO] it=  271 steps=  555008 avg10= -42.52 loss=433.349 pg=0.009 vf=866.694 H=0.936 KL=0.0126 clip_frac=0.102
[PPO] it=  281 steps=  575488 avg10=  -7.23 loss=105.228 pg=0.003 vf=210.463 H=0.906 KL=0.0125 clip_frac=0.088
[PPO] it=  291 steps=  595968 avg10= -28.00 loss=373.422 pg=0.006 vf=746.843 H=0.839 KL=0.0172 clip_frac=0.117
[PPO] it=  293 steps=  600000 avg10= -45.06 loss=110.734 pg=0.000 vf=221.479 H=0.866 KL=0.0137 clip_frac=0.121
[PPO] done steps=600000 time=598.3s avg10=-45.06
Saved LunarLander PPO run 11 model to a3_bonus_ppo_artifacts/lunar_lander/run_11_lunar_ppo/ppo_lunar_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/lunar_lander/run_11_lunar_ppo/ppo_lunar_train_rewards.png
Eval episode 1 seed 1227 return -95.97 steps 81
Eval episode 2 seed 1228 return -146.63 steps 64
Eval episode 3 seed 1229 return 24.19 steps 99
Eval episode 4 seed 1230 return -114.58 steps 142
Eval episode 5 seed 1231 return -72.54 steps 120
Eval episode 6 seed 1232 return -92.62 steps 105
Eval episode 7 seed 1233 return -53.71 steps 130
Eval episode 8 seed 1234 return -158.18 steps 66
Eval episode 9 seed 1235 return -78.18 steps 82
Eval episode 10 seed 1236 return -119.07 steps 87
Greedy evaluation mean -90.73  std 49.14
Saved greedy eval log to a3_bonus_ppo_artifacts/lunar_lander/run_11_lunar_ppo/ppo_lunar_eval_log.csv
No description has been provided for this image
Saved eval plot to a3_bonus_ppo_artifacts/lunar_lander/run_11_lunar_ppo/ppo_lunar_eval_rewards.png
Best eval episode from CSV: ep=3, seed=1229, return=24.19, steps=99
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/lunar_lander/run_11_lunar_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 24.19 steps 99 with seed 1229 into a3_bonus_ppo_artifacts/lunar_lander/run_11_lunar_ppo/videos
Replayed best episode for video: return=24.19, steps=99

Run#12

In [ ]:
# fresh model for run 12
lunar_model_run12 = build_ppo_discrete_model_from_config(lunar_cfg).to(device)

lunar_ppo_cfg_run12 = PPOUpdateConfig(
    clip_range=0.20,
    value_coef=0.5,
    entropy_coef=0.008,   # same exploration level that worked well before
    max_grad_norm=0.5,
    n_epochs=6,
    batch_size=64,
    normalize_adv=True,
)

# run name / directory
lunar_run_name_run12 = "run_12_lunar_ppo"
lunar_run_dir_run12 = make_run_dir(LUNAR_ROOT, lunar_run_name_run12)
print(f"LunarLander PPO run 12 dir: {lunar_run_dir_run12}")

# training budget
lunar_total_steps_run12 = 600_000
lunar_rollout_len_run12 = 2048

# Train PPO
lunar_model_run12, lunar_episode_returns_run12, lunar_logs_run12 = train_ppo_single_env(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run12,
    control_type="discrete",
    run_dir=lunar_run_dir_run12,
    total_env_steps=lunar_total_steps_run12,
    rollout_len=lunar_rollout_len_run12,
    gamma=0.99,
    ppo_cfg=lunar_ppo_cfg_run12,
    lr=2.5e-4,
    log_every=20_000,
)

# save training returns and model
np.save(
    os.path.join(lunar_run_dir_run12, "ppo_lunar_episode_returns.npy"),
    np.array(lunar_episode_returns_run12, dtype=np.float32),
)

lunar_model_path_run12 = os.path.join(lunar_run_dir_run12, "ppo_lunar_model.pth")
torch.save(lunar_model_run12.state_dict(), lunar_model_path_run12)
print(f"Saved LunarLander PPO run 12 model to {lunar_model_path_run12}")

# training curve
plot_rewards(
    rewards=lunar_episode_returns_run12,
    run_dir=lunar_run_dir_run12,
    filename="ppo_lunar_train_rewards.png",
    title="LunarLander - PPO training episode returns (run 12)",
    ma_window=20,
)

# Greedy evaluation with CSV logging
csv_path_run12 = os.path.join(lunar_run_dir_run12, "ppo_lunar_eval_log.csv")

lunar_eval_returns_run12 = evaluate_greedy(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run12,
    control_type="discrete",
    n_episodes=10,
    max_steps=1000,
    base_seed=SEED,          # reproducible seeds
    csv_path=csv_path_run12, # log seeds/returns/steps
)

# save eval .npy
np.save(
    os.path.join(lunar_run_dir_run12, "ppo_lunar_eval_returns.npy"),
    np.array(lunar_eval_returns_run12, dtype=np.float32),
)

# eval plot
plot_eval_returns(
    returns=lunar_eval_returns_run12,
    run_dir=lunar_run_dir_run12,
    filename="ppo_lunar_eval_rewards.png",
    title="LunarLander - PPO greedy evaluation returns (run 12)",
)

# Record video of the best greedy episode
record_best_greedy_from_csv(
    env_id=LUNAR_ENV_ID,
    model=lunar_model_run12,
    control_type="discrete",
    run_dir=lunar_run_dir_run12,
    csv_path=csv_path_run12,
    max_steps=1000,
)
LunarLander PPO run 12 dir: a3_bonus_ppo_artifacts/lunar_lander/run_12_lunar_ppo
[PPO] it=    1 steps=    2048 avg10=-161.87 loss=1858.391 pg=-0.003 vf=3716.810 H=1.380 KL=0.0105 clip_frac=0.057
[PPO] it=   11 steps=   22528 avg10=-188.85 loss=313.019 pg=0.000 vf=626.060 H=1.308 KL=0.0160 clip_frac=0.139
[PPO] it=   21 steps=   43008 avg10=-208.30 loss=851.407 pg=-0.000 vf=1702.831 H=1.057 KL=0.0160 clip_frac=0.128
[PPO] it=   31 steps=   63488 avg10= -71.27 loss=52.326 pg=0.003 vf=104.667 H=1.284 KL=0.0246 clip_frac=0.262
[PPO] it=   41 steps=   83968 avg10= -67.37 loss=361.043 pg=0.000 vf=722.104 H=1.153 KL=0.0190 clip_frac=0.181
[PPO] it=   51 steps=  104448 avg10=-117.25 loss=200.266 pg=0.006 vf=400.538 H=1.117 KL=0.0241 clip_frac=0.195
[PPO] it=   61 steps=  124928 avg10=-122.10 loss=372.135 pg=0.011 vf=744.263 H=1.025 KL=0.0216 clip_frac=0.228
[PPO] it=   71 steps=  145408 avg10= -16.55 loss=66.871 pg=0.006 vf=133.747 H=1.028 KL=0.0184 clip_frac=0.173
[PPO] it=   81 steps=  165888 avg10= -67.87 loss=78.594 pg=0.001 vf=157.203 H=1.098 KL=0.0191 clip_frac=0.179
[PPO] it=   91 steps=  186368 avg10= -16.11 loss=100.389 pg=0.001 vf=200.791 H=1.001 KL=0.0171 clip_frac=0.125
[PPO] it=  101 steps=  206848 avg10= -30.60 loss=56.254 pg=0.004 vf=112.516 H=1.093 KL=0.0209 clip_frac=0.214
[PPO] it=  111 steps=  227328 avg10=   6.63 loss=186.491 pg=0.002 vf=372.997 H=1.150 KL=0.0151 clip_frac=0.122
[PPO] it=  121 steps=  247808 avg10=  63.93 loss=95.008 pg=0.003 vf=190.029 H=1.162 KL=0.0212 clip_frac=0.191
[PPO] it=  131 steps=  268288 avg10=  52.59 loss=149.403 pg=-0.000 vf=298.823 H=1.035 KL=0.0122 clip_frac=0.090
[PPO] it=  141 steps=  288768 avg10=  24.18 loss=107.960 pg=0.003 vf=215.932 H=1.097 KL=0.0144 clip_frac=0.114
[PPO] it=  151 steps=  309248 avg10= -13.55 loss=59.601 pg=-0.001 vf=119.218 H=0.861 KL=0.0221 clip_frac=0.218
[PPO] it=  161 steps=  329728 avg10=  -4.71 loss=211.519 pg=0.001 vf=423.050 H=0.877 KL=0.0161 clip_frac=0.121
[PPO] it=  171 steps=  350208 avg10=  77.59 loss=55.985 pg=0.004 vf=111.974 H=0.790 KL=0.0128 clip_frac=0.087
[PPO] it=  181 steps=  370688 avg10= -36.69 loss=54.432 pg=0.009 vf=108.860 H=0.776 KL=0.0191 clip_frac=0.150
[PPO] it=  191 steps=  391168 avg10=  91.89 loss=24.295 pg=0.004 vf=48.598 H=0.920 KL=0.0150 clip_frac=0.136
[PPO] it=  201 steps=  411648 avg10= 101.84 loss=40.391 pg=0.003 vf=80.789 H=0.807 KL=0.0147 clip_frac=0.073
[PPO] it=  211 steps=  432128 avg10=  69.36 loss=16.436 pg=0.005 vf=32.877 H=0.973 KL=0.0167 clip_frac=0.133
[PPO] it=  221 steps=  452608 avg10=  33.15 loss=162.502 pg=0.002 vf=325.014 H=0.928 KL=0.0163 clip_frac=0.111
[PPO] it=  231 steps=  473088 avg10=-162.59 loss=14.728 pg=0.007 vf=29.459 H=1.002 KL=0.0234 clip_frac=0.244
[PPO] it=  241 steps=  493568 avg10=-123.95 loss=74.659 pg=0.003 vf=149.327 H=0.955 KL=0.0139 clip_frac=0.114
[PPO] it=  251 steps=  514048 avg10=  67.11 loss=26.497 pg=0.002 vf=53.007 H=0.945 KL=0.0175 clip_frac=0.133
[PPO] it=  261 steps=  534528 avg10=  78.64 loss=32.682 pg=-0.001 vf=65.384 H=1.107 KL=0.0277 clip_frac=0.226
[PPO] it=  271 steps=  555008 avg10=  24.81 loss=88.647 pg=-0.002 vf=177.313 H=0.984 KL=0.0173 clip_frac=0.110
[PPO] it=  281 steps=  575488 avg10=  56.79 loss=125.009 pg=-0.005 vf=250.038 H=0.727 KL=0.0192 clip_frac=0.108
[PPO] it=  291 steps=  595968 avg10=  60.38 loss=36.133 pg=0.000 vf=72.282 H=1.053 KL=0.0336 clip_frac=0.254
[PPO] it=  293 steps=  600000 avg10=  -2.77 loss=64.472 pg=0.005 vf=128.950 H=0.944 KL=0.0227 clip_frac=0.189
[PPO] done steps=600000 time=704.0s avg10=-2.77
Saved LunarLander PPO run 12 model to a3_bonus_ppo_artifacts/lunar_lander/run_12_lunar_ppo/ppo_lunar_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/lunar_lander/run_12_lunar_ppo/ppo_lunar_train_rewards.png
Eval episode 1 seed 1227 return -276.16 steps 96
Eval episode 2 seed 1228 return -251.67 steps 100
Eval episode 3 seed 1229 return -215.61 steps 108
Eval episode 4 seed 1230 return 153.30 steps 324
Eval episode 5 seed 1231 return -50.98 steps 256
Eval episode 6 seed 1232 return -33.14 steps 221
Eval episode 7 seed 1233 return 179.03 steps 375
Eval episode 8 seed 1234 return -177.05 steps 82
Eval episode 9 seed 1235 return -36.58 steps 166
Eval episode 10 seed 1236 return -251.28 steps 127
Greedy evaluation mean -96.01  std 157.79
Saved greedy eval log to a3_bonus_ppo_artifacts/lunar_lander/run_12_lunar_ppo/ppo_lunar_eval_log.csv
No description has been provided for this image
Saved eval plot to a3_bonus_ppo_artifacts/lunar_lander/run_12_lunar_ppo/ppo_lunar_eval_rewards.png
Best eval episode from CSV: ep=7, seed=1233, return=179.03, steps=375
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/lunar_lander/run_12_lunar_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 179.03 steps 375 with seed 1233 into a3_bonus_ppo_artifacts/lunar_lander/run_12_lunar_ppo/videos
Replayed best episode for video: return=179.03, steps=375

10. PPO on Acrobot: training and evaluation¶

This section applies the PPO implementation to the Acrobot-v1 environment, which has a discrete action space. A discrete PPO actor–critic network is instantiated using the Acrobot observation dimension and number of actions, reusing the same architecture as in the LunarLander runs. The training loop runs for a fixed number of environment steps and logs episode returns to the Acrobot bonus directory. After training, episode returns are plotted, greedy evaluation episodes are run and logged to a CSV file, and a separate figure is saved for the evaluation returns. The trained model weights and evaluation statistics are stored under the Acrobot subdirectory, together with one recorded greedy evaluation episode.

Run#1

In [ ]:
# Discover Acrobot dimensions
tmp_env = make_env(ACROBOT_ENV_ID, worker_id=0, base_seed=SEED)
obs_dim_acrobot = tmp_env.observation_space.shape[0]
n_actions_acrobot = tmp_env.action_space.n
tmp_env.close()

print(f"Acrobot obs_dim={obs_dim_acrobot}, n_actions={n_actions_acrobot}")

# PPO model configuration for Acrobot (mirror Lunar architecture)
acrobot_cfg = PPODiscreteModelConfig(
    obs_dim=obs_dim_acrobot,
    n_actions=n_actions_acrobot,
    hidden_sizes=(256, 256),
)

# fresh model for run 1
acrobot_model_run1 = build_ppo_discrete_model_from_config(acrobot_cfg).to(device)

# PPO hyperparameters for Acrobot (start from Lunar run 12 style)
acrobot_ppo_cfg_run1 = PPOUpdateConfig(
    clip_range=0.20,
    value_coef=0.5,
    entropy_coef=0.01,   # same exploration level as good Lunar run
    max_grad_norm=0.5,
    n_epochs=4,
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
acrobot_run_name_run1 = "run_1_acrobot_ppo"
acrobot_run_dir_run1 = make_run_dir(ACROBOT_ROOT, acrobot_run_name_run1)
print(f"Acrobot PPO run 1 dir: {acrobot_run_dir_run1}")

# Training budget
acrobot_total_steps_run1 = 250_000
acrobot_rollout_len_run1 = 2048

# Train PPO on Acrobot
acrobot_model_run1, acrobot_episode_returns_run1, acrobot_logs_run1 = train_ppo_single_env(
    env_id=ACROBOT_ENV_ID,
    model=acrobot_model_run1,
    control_type="discrete",
    run_dir=acrobot_run_dir_run1,
    total_env_steps=acrobot_total_steps_run1,
    rollout_len=acrobot_rollout_len_run1,
    gamma=0.99,
    ppo_cfg=acrobot_ppo_cfg_run1,
    lr=3e-4,
    log_every=10_000,
)

# Save training returns and model
np.save(
    os.path.join(acrobot_run_dir_run1, "ppo_acrobot_episode_returns.npy"),
    np.array(acrobot_episode_returns_run1, dtype=np.float32),
)

acrobot_model_path_run1 = os.path.join(acrobot_run_dir_run1, "ppo_acrobot_model.pth")
torch.save(acrobot_model_run1.state_dict(), acrobot_model_path_run1)
print(f"Saved Acrobot PPO run 1 model to {acrobot_model_path_run1}")

# Training curve
plot_rewards(
    rewards=acrobot_episode_returns_run1,
    run_dir=acrobot_run_dir_run1,
    filename="ppo_acrobot_train_rewards.png",
    title="Acrobot PPO training episode returns (run 1)",
    ma_window=20,
)

# Greedy evaluation with CSV logging (10 episodes)
csv_path_acrobot_run1 = os.path.join(acrobot_run_dir_run1, "ppo_acrobot_eval_log.csv")

acrobot_eval_returns_run1 = evaluate_greedy(
    env_id=ACROBOT_ENV_ID,
    model=acrobot_model_run1,
    control_type="discrete",
    n_episodes=10,
    max_steps=1000,
    base_seed=SEED,               # reproducible seeds
    csv_path=csv_path_acrobot_run1,  # log seeds/returns/steps
)

# save eval .npy
np.save(
    os.path.join(acrobot_run_dir_run1, "ppo_acrobot_eval_returns.npy"),
    np.array(acrobot_eval_returns_run1, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=acrobot_eval_returns_run1,
    run_dir=acrobot_run_dir_run1,
    filename="ppo_acrobot_eval_rewards.png",
    title="Acrobot PPO greedy evaluation returns (run 1)",
)

# Record video of the *best* greedy evaluation episode
record_best_greedy_from_csv(
    env_id=ACROBOT_ENV_ID,
    model=acrobot_model_run1,
    control_type="discrete",
    run_dir=acrobot_run_dir_run1,
    csv_path=csv_path_acrobot_run1,
    max_steps=1000,
)
Acrobot obs_dim=6, n_actions=3
Acrobot PPO run 1 dir: a3_bonus_ppo_artifacts/acrobot/run_1_acrobot_ppo
[PPO] it=    1 steps=    2048 avg10=-500.00 loss=1988.976 pg=0.000 vf=3977.973 H=1.095 KL=0.0071 clip_frac=0.041
[PPO] it=    6 steps=   12288 avg10=-500.00 loss=0.289 pg=0.010 vf=0.576 H=0.923 KL=0.0227 clip_frac=0.289
[PPO] it=   11 steps=   22528 avg10=-500.00 loss=0.308 pg=0.008 vf=0.618 H=0.900 KL=0.0204 clip_frac=0.260
[PPO] it=   16 steps=   32768 avg10=-500.00 loss=0.496 pg=0.002 vf=1.010 H=1.049 KL=0.0125 clip_frac=0.139
[PPO] it=   21 steps=   43008 avg10=-500.00 loss=0.351 pg=0.005 vf=0.713 H=1.013 KL=0.0216 clip_frac=0.303
[PPO] it=   26 steps=   53248 avg10=-471.60 loss=56.965 pg=-0.003 vf=113.952 H=0.885 KL=0.0103 clip_frac=0.090
[PPO] it=   31 steps=   63488 avg10=-427.00 loss=49.706 pg=0.000 vf=99.429 H=0.889 KL=0.0068 clip_frac=0.025
[PPO] it=   36 steps=   73728 avg10=-348.10 loss=22.884 pg=0.000 vf=45.784 H=0.816 KL=0.0095 clip_frac=0.070
[PPO] it=   41 steps=   83968 avg10=-288.20 loss=45.798 pg=-0.000 vf=91.610 H=0.689 KL=0.0099 clip_frac=0.044
[PPO] it=   46 steps=   94208 avg10=-279.00 loss=21.395 pg=-0.002 vf=42.809 H=0.690 KL=0.0119 clip_frac=0.107
[PPO] it=   51 steps=  104448 avg10=-235.30 loss=106.716 pg=0.007 vf=213.429 H=0.542 KL=0.0146 clip_frac=0.094
[PPO] it=   56 steps=  114688 avg10=-138.10 loss=29.937 pg=-0.001 vf=59.890 H=0.668 KL=0.0107 clip_frac=0.093
[PPO] it=   61 steps=  124928 avg10=-183.40 loss=19.758 pg=0.002 vf=39.521 H=0.504 KL=0.0088 clip_frac=0.058
[PPO] it=   66 steps=  135168 avg10=-124.10 loss=25.576 pg=0.001 vf=51.160 H=0.535 KL=0.0143 clip_frac=0.111
[PPO] it=   71 steps=  145408 avg10=-121.60 loss=41.031 pg=0.000 vf=82.071 H=0.464 KL=0.0107 clip_frac=0.073
[PPO] it=   76 steps=  155648 avg10=-117.40 loss=20.018 pg=0.005 vf=40.036 H=0.429 KL=0.0117 clip_frac=0.082
[PPO] it=   81 steps=  165888 avg10= -89.70 loss=35.927 pg=0.000 vf=71.860 H=0.319 KL=0.0072 clip_frac=0.034
[PPO] it=   86 steps=  176128 avg10=-100.00 loss=17.430 pg=0.003 vf=34.860 H=0.299 KL=0.0080 clip_frac=0.046
[PPO] it=   91 steps=  186368 avg10= -88.10 loss=54.625 pg=0.000 vf=109.255 H=0.241 KL=0.0046 clip_frac=0.015
[PPO] it=   96 steps=  196608 avg10= -89.20 loss=30.780 pg=0.000 vf=61.565 H=0.256 KL=0.0090 clip_frac=0.038
[PPO] it=  101 steps=  206848 avg10= -98.10 loss=92.398 pg=0.001 vf=184.798 H=0.239 KL=0.0093 clip_frac=0.043
[PPO] it=  106 steps=  217088 avg10=-105.30 loss=43.351 pg=0.000 vf=86.705 H=0.205 KL=0.0048 clip_frac=0.018
[PPO] it=  111 steps=  227328 avg10= -83.40 loss=68.499 pg=0.001 vf=137.002 H=0.200 KL=0.0061 clip_frac=0.018
[PPO] it=  116 steps=  237568 avg10= -84.60 loss=10.485 pg=0.002 vf=20.969 H=0.212 KL=0.0096 clip_frac=0.050
[PPO] it=  121 steps=  247808 avg10= -84.10 loss=13.664 pg=0.000 vf=27.331 H=0.199 KL=0.0105 clip_frac=0.043
[PPO] it=  123 steps=  250000 avg10= -83.70 loss=4.984 pg=-0.033 vf=10.038 H=0.244 KL=0.0068 clip_frac=0.013
[PPO] done steps=250000 time=257.4s avg10=-83.70
Saved Acrobot PPO run 1 model to a3_bonus_ppo_artifacts/acrobot/run_1_acrobot_ppo/ppo_acrobot_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/acrobot/run_1_acrobot_ppo/ppo_acrobot_train_rewards.png
Eval episode 1 seed 1227 return -64.00 steps 65
Eval episode 2 seed 1228 return -92.00 steps 93
Eval episode 3 seed 1229 return -83.00 steps 84
Eval episode 4 seed 1230 return -64.00 steps 65
Eval episode 5 seed 1231 return -75.00 steps 76
Eval episode 6 seed 1232 return -95.00 steps 96
Eval episode 7 seed 1233 return -118.00 steps 119
Eval episode 8 seed 1234 return -64.00 steps 65
Eval episode 9 seed 1235 return -87.00 steps 88
Eval episode 10 seed 1236 return -92.00 steps 93
Greedy evaluation mean -83.40  std 16.41
Saved greedy eval log to a3_bonus_ppo_artifacts/acrobot/run_1_acrobot_ppo/ppo_acrobot_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/acrobot/run_1_acrobot_ppo/ppo_acrobot_eval_rewards.png
Best eval episode from CSV: ep=1, seed=1227, return=-64.00, steps=65
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/acrobot/run_1_acrobot_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return -64.00 steps 65 with seed 1227 into a3_bonus_ppo_artifacts/acrobot/run_1_acrobot_ppo/videos
Replayed best episode for video: return=-64.00, steps=65
In [ ]:
plot_rewards(
    rewards=acrobot_eval_returns_run1,
    run_dir=acrobot_run_dir_run1,
    filename="ppo_acrobot_eval_rewards.png",
    title="Acrobot PPO greedy evaluation returns (run 1)",
    ma_window=3,
)
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/acrobot/run_1_acrobot_ppo/ppo_acrobot_eval_rewards.png

Run#2

In [ ]:
# fresh model for run 2 (reuse the same acrobot_cfg defined above)
acrobot_model_run2 = build_ppo_discrete_model_from_config(acrobot_cfg).to(device)

# PPO hyperparameters for Acrobot – slight variant of run 1
acrobot_ppo_cfg_run2 = PPOUpdateConfig(
    clip_range=0.20,
    value_coef=0.5,
    entropy_coef=0.008,   # a bit lower entropy (more exploitation)
    max_grad_norm=0.5,
    n_epochs=6,           # a bit more epochs per update
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
acrobot_run_name_run2 = "run_2_acrobot_ppo"
acrobot_run_dir_run2 = make_run_dir(ACROBOT_ROOT, acrobot_run_name_run2)
print(f"Acrobot PPO run 2 dir: {acrobot_run_dir_run2}")

# Training budget (slightly longer than run 1)
acrobot_total_steps_run2 = 300_000
acrobot_rollout_len_run2 = 2048

# Train PPO
acrobot_model_run2, acrobot_episode_returns_run2, acrobot_logs_run2 = train_ppo_single_env(
    env_id=ACROBOT_ENV_ID,
    model=acrobot_model_run2,
    control_type="discrete",
    run_dir=acrobot_run_dir_run2,
    total_env_steps=acrobot_total_steps_run2,
    rollout_len=acrobot_rollout_len_run2,
    gamma=0.99,
    ppo_cfg=acrobot_ppo_cfg_run2,
    lr=2.5e-4,
    log_every=20_000,
)

# Save training returns and model
np.save(
    os.path.join(acrobot_run_dir_run2, "ppo_acrobot_episode_returns.npy"),
    np.array(acrobot_episode_returns_run2, dtype=np.float32),
)

acrobot_model_path_run2 = os.path.join(acrobot_run_dir_run2, "ppo_acrobot_model.pth")
torch.save(acrobot_model_run2.state_dict(), acrobot_model_path_run2)
print(f"Saved Acrobot PPO run 2 model to {acrobot_model_path_run2}")

# Training curve
plot_rewards(
    rewards=acrobot_episode_returns_run2,
    run_dir=acrobot_run_dir_run2,
    filename="ppo_acrobot_train_rewards.png",
    title="Acrobot PPO training episode returns (run 2)",
    ma_window=20,
)

# Greedy evaluation with CSV logging
csv_path_acrobot_run2 = os.path.join(acrobot_run_dir_run2, "ppo_acrobot_eval_log.csv")

acrobot_eval_returns_run2 = evaluate_greedy(
    env_id=ACROBOT_ENV_ID,
    model=acrobot_model_run2,
    control_type="discrete",
    n_episodes=10,
    max_steps=1000,
    base_seed=SEED,
    csv_path=csv_path_acrobot_run2,
)

# Save eval .npy
np.save(
    os.path.join(acrobot_run_dir_run2, "ppo_acrobot_eval_returns.npy"),
    np.array(acrobot_eval_returns_run2, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=acrobot_eval_returns_run2,
    run_dir=acrobot_run_dir_run2,
    filename="ppo_acrobot_eval_rewards.png",
    title="Acrobot PPO greedy evaluation returns (run 2)",
    ma_window=3,
)

# Record video of best greedy episode
record_best_greedy_from_csv(
    env_id=ACROBOT_ENV_ID,
    model=acrobot_model_run2,
    control_type="discrete",
    run_dir=acrobot_run_dir_run2,
    csv_path=csv_path_acrobot_run2,
    max_steps=1000,
)
Acrobot PPO run 2 dir: a3_bonus_ppo_artifacts/acrobot/run_2_acrobot_ppo
[PPO] it=    1 steps=    2048 avg10=-500.00 loss=1855.664 pg=-0.002 vf=3711.348 H=1.095 KL=0.0079 clip_frac=0.035
[PPO] it=   11 steps=   22528 avg10=-162.80 loss=18.072 pg=-0.003 vf=36.160 H=0.609 KL=0.0125 clip_frac=0.109
[PPO] it=   21 steps=   43008 avg10=-146.80 loss=54.360 pg=-0.001 vf=108.728 H=0.428 KL=0.0100 clip_frac=0.061
[PPO] it=   31 steps=   63488 avg10=-107.90 loss=46.132 pg=0.004 vf=92.263 H=0.421 KL=0.0132 clip_frac=0.081
[PPO] it=   41 steps=   83968 avg10=-135.60 loss=25.104 pg=0.000 vf=50.213 H=0.324 KL=0.0242 clip_frac=0.124
[PPO] it=   51 steps=  104448 avg10=-119.90 loss=23.967 pg=0.001 vf=47.937 H=0.284 KL=0.0163 clip_frac=0.074
[PPO] it=   61 steps=  124928 avg10= -92.60 loss=63.052 pg=0.003 vf=126.102 H=0.258 KL=0.0112 clip_frac=0.062
[PPO] it=   71 steps=  145408 avg10= -89.80 loss=16.350 pg=0.002 vf=32.700 H=0.223 KL=0.0093 clip_frac=0.055
[PPO] it=   81 steps=  165888 avg10= -82.10 loss=45.863 pg=-0.007 vf=91.745 H=0.233 KL=0.0527 clip_frac=0.087
[PPO] it=   91 steps=  186368 avg10= -78.30 loss=23.651 pg=0.001 vf=47.304 H=0.210 KL=0.0090 clip_frac=0.039
[PPO] it=  101 steps=  206848 avg10= -97.80 loss=67.326 pg=0.001 vf=134.653 H=0.190 KL=0.0063 clip_frac=0.029
[PPO] it=  111 steps=  227328 avg10= -79.30 loss=10.792 pg=0.002 vf=21.583 H=0.223 KL=0.0097 clip_frac=0.063
[PPO] it=  121 steps=  247808 avg10= -83.90 loss=14.255 pg=0.001 vf=28.512 H=0.216 KL=0.0094 clip_frac=0.050
[PPO] it=  131 steps=  268288 avg10= -88.10 loss=49.104 pg=0.001 vf=98.210 H=0.199 KL=0.0089 clip_frac=0.037
[PPO] it=  141 steps=  288768 avg10= -97.20 loss=36.568 pg=-0.000 vf=73.138 H=0.162 KL=0.0064 clip_frac=0.025
[PPO] it=  147 steps=  300000 avg10= -85.00 loss=10.079 pg=0.000 vf=20.160 H=0.177 KL=0.0089 clip_frac=0.050
[PPO] done steps=300000 time=361.0s avg10=-85.00
Saved Acrobot PPO run 2 model to a3_bonus_ppo_artifacts/acrobot/run_2_acrobot_ppo/ppo_acrobot_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/acrobot/run_2_acrobot_ppo/ppo_acrobot_train_rewards.png
Eval episode 1 seed 1227 return -76.00 steps 77
Eval episode 2 seed 1228 return -84.00 steps 85
Eval episode 3 seed 1229 return -77.00 steps 78
Eval episode 4 seed 1230 return -88.00 steps 89
Eval episode 5 seed 1231 return -96.00 steps 97
Eval episode 6 seed 1232 return -78.00 steps 79
Eval episode 7 seed 1233 return -92.00 steps 93
Eval episode 8 seed 1234 return -80.00 steps 81
Eval episode 9 seed 1235 return -84.00 steps 85
Eval episode 10 seed 1236 return -77.00 steps 78
Greedy evaluation mean -83.20  std 6.57
Saved greedy eval log to a3_bonus_ppo_artifacts/acrobot/run_2_acrobot_ppo/ppo_acrobot_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/acrobot/run_2_acrobot_ppo/ppo_acrobot_eval_rewards.png
Best eval episode from CSV: ep=1, seed=1227, return=-76.00, steps=77
Recorded greedy PPO episode return -76.00 steps 77 with seed 1227 into a3_bonus_ppo_artifacts/acrobot/run_2_acrobot_ppo/videos
Replayed best episode for video: return=-76.00, steps=77

11. PPO on BipedalWalker: training and evaluation¶

This section applies the PPO implementation to the BipedalWalker-v3 environment, which has a continuous action space. A continuous PPO actor–critic network is instantiated using the BipedalWalker observation and action dimensions, reusing the same two-layer architecture as in the A2C and LunarLander experiments. The training loop runs for a fixed number of environment steps and logs episode returns to the BipedalWalker directory. After training, episode returns are plotted, greedy evaluation episodes are run and logged to a CSV file, and a separate figure is saved for the evaluation returns using the same plotting style as the A2C runs. The trained model weights and evaluation statistics are stored under the BipedalWalker, together with one recorded greedy evaluation episode.

Run#1

In [ ]:
# Discover BipedalWalker dimensions
tmp_env = make_env(BIPEDAL_ENV_ID, worker_id=0, base_seed=SEED)
obs_dim_bipedal = tmp_env.observation_space.shape[0]
act_dim_bipedal = tmp_env.action_space.shape[0]
tmp_env.close()

print(f"BipedalWalker obs_dim={obs_dim_bipedal}, act_dim={act_dim_bipedal}")

# PPO model configuration for BipedalWalker (continuous)
bipedal_cfg = PPOContinuousModelConfig(
    obs_dim=obs_dim_bipedal,
    act_dim=act_dim_bipedal,
    hidden_sizes=(256, 256),
)

# fresh model for run 1
bipedal_model_run1 = build_ppo_continuous_model_from_config(bipedal_cfg).to(device)

# PPO hyperparameters for BipedalWalker
bipedal_ppo_cfg_run1 = PPOUpdateConfig(
    clip_range=0.20,
    value_coef=0.5,
    entropy_coef=0.01,
    max_grad_norm=0.5,
    n_epochs=10,
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run1 = "run_1_bipedal_ppo"
bipedal_run_dir_run1 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run1)
print(f"BipedalWalker PPO run 1 dir: {bipedal_run_dir_run1}")

# Training budget
bipedal_total_steps_run1 = 600_000
bipedal_rollout_len_run1 = 2048

# Train PPO on BipedalWalker
bipedal_model_run1, bipedal_episode_returns_run1, bipedal_logs_run1 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run1,
    control_type="continuous",
    run_dir=bipedal_run_dir_run1,
    total_env_steps=bipedal_total_steps_run1,
    rollout_len=bipedal_rollout_len_run1,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run1,
    lr=3e-4,
    log_every=20_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run1, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run1, dtype=np.float32),
)

bipedal_model_path_run1 = os.path.join(bipedal_run_dir_run1, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run1.state_dict(), bipedal_model_path_run1)
print(f"Saved BipedalWalker PPO run 1 model to {bipedal_model_path_run1}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run1,
    run_dir=bipedal_run_dir_run1,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 1)",
    ma_window=20,
)

# Greedy evaluation with CSV logging (10 episodes)
csv_path_bipedal_run1 = os.path.join(bipedal_run_dir_run1, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run1 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run1,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,              # full BipedalWalker episode length
    base_seed=SEED,
    csv_path=csv_path_bipedal_run1,
)

# save eval .npy
np.save(
    os.path.join(bipedal_run_dir_run1, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run1, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run1,
    run_dir=bipedal_run_dir_run1,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 1)",
    ma_window=3,
)

# Record video of the *best* greedy evaluation episode
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run1,
    control_type="continuous",
    run_dir=bipedal_run_dir_run1,
    csv_path=csv_path_bipedal_run1,
    max_steps=1600,
)
BipedalWalker obs_dim=24, act_dim=4
BipedalWalker PPO run 1 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_1_bipedal_ppo
[PPO] it=    1 steps=    2048 avg10=-109.26 loss=202.330 pg=0.013 vf=404.748 H=5.687 KL=0.0290 clip_frac=0.197
[PPO] it=   11 steps=   22528 avg10=-110.59 loss=10.201 pg=0.017 vf=20.484 H=5.828 KL=0.0258 clip_frac=0.202
[PPO] it=   21 steps=   43008 avg10=-112.58 loss=0.203 pg=0.005 vf=0.513 H=5.902 KL=0.0257 clip_frac=0.248
[PPO] it=   31 steps=   63488 avg10=-109.02 loss=0.099 pg=-0.004 vf=0.324 H=5.905 KL=0.0214 clip_frac=0.181
[PPO] it=   41 steps=   83968 avg10=-100.35 loss=18.498 pg=0.030 vf=37.053 H=5.866 KL=0.0976 clip_frac=0.422
[PPO] it=   51 steps=  104448 avg10=-127.50 loss=17.427 pg=0.019 vf=34.937 H=6.042 KL=0.0503 clip_frac=0.263
[PPO] it=   61 steps=  124928 avg10=-141.24 loss=0.083 pg=0.008 vf=0.274 H=6.274 KL=0.0394 clip_frac=0.397
[PPO] it=   71 steps=  145408 avg10=-149.76 loss=7.327 pg=0.011 vf=14.759 H=6.405 KL=0.0271 clip_frac=0.254
[PPO] it=   81 steps=  165888 avg10=-141.56 loss=0.070 pg=-0.000 vf=0.272 H=6.591 KL=0.0153 clip_frac=0.140
[PPO] it=   91 steps=  186368 avg10=-141.22 loss=-0.064 pg=-0.008 vf=0.018 H=6.588 KL=0.0126 clip_frac=0.079
[PPO] it=  101 steps=  206848 avg10=-137.75 loss=-0.061 pg=-0.007 vf=0.018 H=6.357 KL=0.0159 clip_frac=0.107
[PPO] it=  111 steps=  227328 avg10=-125.96 loss=-0.014 pg=-0.008 vf=0.111 H=6.128 KL=0.0165 clip_frac=0.083
[PPO] it=  121 steps=  247808 avg10=-104.86 loss=-0.019 pg=-0.012 vf=0.104 H=5.913 KL=0.0258 clip_frac=0.132
[PPO] it=  131 steps=  268288 avg10= -90.70 loss=-0.000 pg=-0.015 vf=0.146 H=5.823 KL=0.0175 clip_frac=0.140
[PPO] it=  141 steps=  288768 avg10= -71.88 loss=0.001 pg=-0.016 vf=0.148 H=5.719 KL=0.0178 clip_frac=0.157
[PPO] it=  151 steps=  309248 avg10= -60.80 loss=0.162 pg=-0.019 vf=0.473 H=5.651 KL=0.0183 clip_frac=0.186
[PPO] it=  161 steps=  329728 avg10= -27.61 loss=0.449 pg=-0.018 vf=1.047 H=5.651 KL=0.0195 clip_frac=0.178
[PPO] it=  171 steps=  350208 avg10=  12.94 loss=0.308 pg=-0.013 vf=0.755 H=5.681 KL=0.0237 clip_frac=0.216
[PPO] it=  181 steps=  370688 avg10=   7.21 loss=0.242 pg=-0.021 vf=0.642 H=5.803 KL=0.0312 clip_frac=0.226
[PPO] it=  191 steps=  391168 avg10=   8.28 loss=22.491 pg=-0.014 vf=45.126 H=5.868 KL=1.9218 clip_frac=0.467
[PPO] it=  201 steps=  411648 avg10=  42.80 loss=4.188 pg=0.143 vf=8.209 H=5.921 KL=0.4008 clip_frac=0.535
[PPO] it=  211 steps=  432128 avg10=-107.70 loss=0.539 pg=0.024 vf=1.150 H=6.072 KL=0.0663 clip_frac=0.472
[PPO] it=  221 steps=  452608 avg10=-105.74 loss=0.950 pg=0.027 vf=1.972 H=6.264 KL=0.0800 clip_frac=0.481
[PPO] it=  231 steps=  473088 avg10=-107.90 loss=0.158 pg=0.017 vf=0.413 H=6.470 KL=0.0485 clip_frac=0.407
[PPO] it=  241 steps=  493568 avg10=-107.79 loss=0.335 pg=0.007 vf=0.790 H=6.749 KL=0.0493 clip_frac=0.413
[PPO] it=  251 steps=  514048 avg10=-108.08 loss=0.243 pg=0.009 vf=0.606 H=6.969 KL=0.0509 clip_frac=0.415
[PPO] it=  261 steps=  534528 avg10=-108.15 loss=0.257 pg=0.009 vf=0.639 H=7.204 KL=0.0495 clip_frac=0.368
[PPO] it=  271 steps=  555008 avg10=-108.34 loss=0.224 pg=0.005 vf=0.585 H=7.299 KL=0.0392 clip_frac=0.346
[PPO] it=  281 steps=  575488 avg10=-108.41 loss=0.166 pg=0.001 vf=0.479 H=7.451 KL=0.0386 clip_frac=0.331
[PPO] it=  291 steps=  595968 avg10=-108.52 loss=0.406 pg=-0.002 vf=0.967 H=7.584 KL=0.0300 clip_frac=0.321
[PPO] it=  293 steps=  600000 avg10=-108.35 loss=0.247 pg=0.001 vf=0.643 H=7.586 KL=0.0289 clip_frac=0.293
[PPO] done steps=600000 time=1325.8s avg10=-108.35
Saved BipedalWalker PPO run 1 model to a3_bonus_ppo_artifacts/bipedal_walker/run_1_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_1_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return -108.62 steps 109
Eval episode 2 seed 1228 return -108.56 steps 109
Eval episode 3 seed 1229 return -108.34 steps 108
Eval episode 4 seed 1230 return -108.91 steps 108
Eval episode 5 seed 1231 return -108.34 steps 108
Eval episode 6 seed 1232 return -108.56 steps 109
Eval episode 7 seed 1233 return -108.63 steps 108
Eval episode 8 seed 1234 return -108.56 steps 109
Eval episode 9 seed 1235 return -108.75 steps 108
Eval episode 10 seed 1236 return -108.59 steps 109
Greedy evaluation mean -108.58  std 0.16
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_1_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_1_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=5, seed=1231, return=-108.34, steps=108
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_1_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
/usr/local/lib/python3.12/dist-packages/moviepy/config_defaults.py:47: SyntaxWarning: invalid escape sequence '\P'
  IMAGEMAGICK_BINARY = r"C:\Program Files\ImageMagick-6.8.8-Q16\magick.exe"
Recorded greedy PPO episode return -108.34 steps 108 with seed 1231 into a3_bonus_ppo_artifacts/bipedal_walker/run_1_bipedal_ppo/videos
Replayed best episode for video: return=-108.34, steps=108

Run#2

In [ ]:
# reuse obs_dim_bipedal, act_dim_bipedal and bipedal_cfg from run 1

# fresh model for run 2
bipedal_model_run2 = build_ppo_continuous_model_from_config(bipedal_cfg).to(device)

# PPO hyperparameters for BipedalWalker – more conservative than run 1
bipedal_ppo_cfg_run2 = PPOUpdateConfig(
    clip_range=0.15,     # smaller clip for more stable updates
    value_coef=0.5,
    entropy_coef=0.005,  # a bit less exploration noise
    max_grad_norm=0.5,
    n_epochs=5,          # fewer passes per batch to reduce overfitting / KL spikes
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run2 = "run_2_bipedal_ppo"
bipedal_run_dir_run2 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run2)
print(f"BipedalWalker PPO run 2 dir: {bipedal_run_dir_run2}")

# Training budget
bipedal_total_steps_run2 = 800_000
bipedal_rollout_len_run2 = 2048

# Train PPO
bipedal_model_run2, bipedal_episode_returns_run2, bipedal_logs_run2 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run2,
    control_type="continuous",
    run_dir=bipedal_run_dir_run2,
    total_env_steps=bipedal_total_steps_run2,
    rollout_len=bipedal_rollout_len_run2,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run2,
    lr=2.0e-4,          # a bit smaller than run 1
    log_every=20_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run2, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run2, dtype=np.float32),
)

bipedal_model_path_run2 = os.path.join(bipedal_run_dir_run2, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run2.state_dict(), bipedal_model_path_run2)
print(f"Saved BipedalWalker PPO run 2 model to {bipedal_model_path_run2}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run2,
    run_dir=bipedal_run_dir_run2,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 2)",
    ma_window=20,
)

# Greedy evaluation with CSV logging
csv_path_bipedal_run2 = os.path.join(bipedal_run_dir_run2, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run2 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run2,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run2,
)

# Save eval .npy
np.save(
    os.path.join(bipedal_run_dir_run2, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run2, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run2,
    run_dir=bipedal_run_dir_run2,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 2)",
    ma_window=3,
)

# Record video of the best greedy episode
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run2,
    control_type="continuous",
    run_dir=bipedal_run_dir_run2,
    csv_path=csv_path_bipedal_run2,
    max_steps=1600,
)
BipedalWalker PPO run 2 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_2_bipedal_ppo
[PPO] it=    1 steps=    2048 avg10=-107.31 loss=63.824 pg=-0.001 vf=127.706 H=5.681 KL=0.0091 clip_frac=0.097
[PPO] it=   11 steps=   22528 avg10=-106.65 loss=0.214 pg=0.001 vf=0.483 H=5.705 KL=0.0115 clip_frac=0.146
[PPO] it=   21 steps=   43008 avg10= -95.83 loss=10.980 pg=0.006 vf=22.005 H=5.665 KL=0.0087 clip_frac=0.093
[PPO] it=   31 steps=   63488 avg10= -97.54 loss=22.132 pg=0.003 vf=44.316 H=5.652 KL=0.0115 clip_frac=0.114
[PPO] it=   41 steps=   83968 avg10= -94.20 loss=0.930 pg=0.003 vf=1.909 H=5.630 KL=0.0104 clip_frac=0.135
[PPO] it=   51 steps=  104448 avg10= -89.29 loss=0.220 pg=0.001 vf=0.493 H=5.626 KL=0.0095 clip_frac=0.108
[PPO] it=   61 steps=  124928 avg10= -95.65 loss=0.317 pg=0.001 vf=0.686 H=5.567 KL=0.0100 clip_frac=0.136
[PPO] it=   71 steps=  145408 avg10= -81.67 loss=0.220 pg=0.005 vf=0.485 H=5.590 KL=0.0105 clip_frac=0.156
[PPO] it=   81 steps=  165888 avg10= -90.17 loss=7.052 pg=0.002 vf=14.156 H=5.577 KL=0.0078 clip_frac=0.085
[PPO] it=   91 steps=  186368 avg10= -87.54 loss=0.092 pg=0.001 vf=0.238 H=5.554 KL=0.0088 clip_frac=0.082
[PPO] it=  101 steps=  206848 avg10= -77.47 loss=0.057 pg=-0.002 vf=0.174 H=5.561 KL=0.0110 clip_frac=0.138
[PPO] it=  111 steps=  227328 avg10= -64.73 loss=0.194 pg=-0.001 vf=0.445 H=5.554 KL=0.0089 clip_frac=0.091
[PPO] it=  121 steps=  247808 avg10= -35.66 loss=0.203 pg=-0.007 vf=0.475 H=5.491 KL=0.0103 clip_frac=0.110
[PPO] it=  131 steps=  268288 avg10= -13.81 loss=0.302 pg=-0.001 vf=0.659 H=5.430 KL=0.0098 clip_frac=0.124
[PPO] it=  141 steps=  288768 avg10=  15.95 loss=22.927 pg=-0.008 vf=45.922 H=5.375 KL=0.2052 clip_frac=0.163
[PPO] it=  151 steps=  309248 avg10=  31.40 loss=23.785 pg=0.004 vf=47.615 H=5.370 KL=0.0637 clip_frac=0.179
[PPO] it=  161 steps=  329728 avg10=  63.76 loss=0.854 pg=0.001 vf=1.758 H=5.313 KL=0.0138 clip_frac=0.171
[PPO] it=  171 steps=  350208 avg10= -11.42 loss=38.820 pg=0.128 vf=77.436 H=5.302 KL=0.2185 clip_frac=0.392
[PPO] it=  181 steps=  370688 avg10=-101.56 loss=9.595 pg=0.115 vf=19.014 H=5.333 KL=0.2572 clip_frac=0.695
[PPO] it=  191 steps=  391168 avg10=-101.80 loss=2.575 pg=0.026 vf=5.154 H=5.439 KL=0.0459 clip_frac=0.483
[PPO] it=  201 steps=  411648 avg10=-101.98 loss=0.821 pg=0.011 vf=1.675 H=5.533 KL=0.0274 clip_frac=0.371
[PPO] it=  211 steps=  432128 avg10=-101.88 loss=0.838 pg=0.006 vf=1.721 H=5.665 KL=0.0213 clip_frac=0.333
[PPO] it=  221 steps=  452608 avg10=-102.22 loss=0.747 pg=0.008 vf=1.535 H=5.680 KL=0.0252 clip_frac=0.340
[PPO] it=  231 steps=  473088 avg10=-102.24 loss=1.581 pg=0.012 vf=3.195 H=5.736 KL=0.0218 clip_frac=0.340
[PPO] it=  241 steps=  493568 avg10=-121.95 loss=8.827 pg=0.019 vf=17.675 H=5.749 KL=0.0296 clip_frac=0.308
[PPO] it=  251 steps=  514048 avg10=-128.43 loss=1.138 pg=0.014 vf=2.306 H=5.828 KL=0.0203 clip_frac=0.298
[PPO] it=  261 steps=  534528 avg10=-107.23 loss=0.275 pg=0.007 vf=0.593 H=5.833 KL=0.0178 clip_frac=0.271
[PPO] it=  271 steps=  555008 avg10=-115.50 loss=19.585 pg=0.003 vf=39.223 H=5.920 KL=0.0117 clip_frac=0.123
[PPO] it=  281 steps=  575488 avg10=-119.21 loss=0.598 pg=0.002 vf=1.252 H=6.008 KL=0.0120 clip_frac=0.162
[PPO] it=  291 steps=  595968 avg10=-120.78 loss=23.592 pg=0.008 vf=47.228 H=5.999 KL=0.0163 clip_frac=0.190
[PPO] it=  301 steps=  616448 avg10=-105.79 loss=30.162 pg=-0.000 vf=60.384 H=6.024 KL=0.0133 clip_frac=0.180
[PPO] it=  311 steps=  636928 avg10=-115.62 loss=39.452 pg=0.014 vf=78.937 H=6.064 KL=0.0207 clip_frac=0.192
[PPO] it=  321 steps=  657408 avg10= -96.41 loss=0.835 pg=0.014 vf=1.702 H=6.110 KL=0.0179 clip_frac=0.303
[PPO] it=  331 steps=  677888 avg10= -79.66 loss=0.513 pg=0.004 vf=1.079 H=6.167 KL=0.0139 clip_frac=0.191
[PPO] it=  341 steps=  698368 avg10= -86.15 loss=0.697 pg=0.008 vf=1.441 H=6.185 KL=0.0126 clip_frac=0.226
[PPO] it=  351 steps=  718848 avg10= -73.99 loss=9.093 pg=-0.002 vf=18.252 H=6.220 KL=0.0134 clip_frac=0.156
[PPO] it=  361 steps=  739328 avg10= -77.36 loss=23.595 pg=0.020 vf=47.212 H=6.284 KL=0.0349 clip_frac=0.207
[PPO] it=  371 steps=  759808 avg10= -60.45 loss=0.612 pg=0.003 vf=1.281 H=6.313 KL=0.0118 clip_frac=0.192
[PPO] it=  381 steps=  780288 avg10= -74.80 loss=19.683 pg=0.015 vf=39.399 H=6.341 KL=0.0194 clip_frac=0.269
[PPO] it=  391 steps=  800000 avg10= -94.80 loss=4.575 pg=0.022 vf=9.170 H=6.369 KL=0.0302 clip_frac=0.328
[PPO] done steps=800000 time=1069.6s avg10=-94.80
Saved BipedalWalker PPO run 2 model to a3_bonus_ppo_artifacts/bipedal_walker/run_2_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_2_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return -93.89 steps 1371
Eval episode 2 seed 1228 return 36.13 steps 1600
Eval episode 3 seed 1229 return -109.15 steps 473
Eval episode 4 seed 1230 return -39.60 steps 1600
Eval episode 5 seed 1231 return -109.42 steps 289
Eval episode 6 seed 1232 return -106.15 steps 358
Eval episode 7 seed 1233 return -90.44 steps 1534
Eval episode 8 seed 1234 return -38.24 steps 1328
Eval episode 9 seed 1235 return -110.32 steps 454
Eval episode 10 seed 1236 return -59.39 steps 1341
Greedy evaluation mean -72.05  std 45.03
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_2_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_2_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=2, seed=1228, return=36.13, steps=1600
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_2_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 36.13 steps 1600 with seed 1228 into a3_bonus_ppo_artifacts/bipedal_walker/run_2_bipedal_ppo/videos
Replayed best episode for video: return=36.13, steps=1600

Run#3

In [ ]:
# reuse obs_dim_bipedal, act_dim_bipedal and bipedal_cfg from above

# fresh model for run 3
bipedal_model_run3 = build_ppo_continuous_model_from_config(bipedal_cfg).to(device)

# PPO hyperparameters for BipedalWalker – more conservative / stable
bipedal_ppo_cfg_run3 = PPOUpdateConfig(
    clip_range=0.10,     # smaller clip range for gentler policy updates
    value_coef=0.5,
    entropy_coef=0.003,  # slightly lower exploration
    max_grad_norm=0.5,
    n_epochs=3,          # fewer epochs per batch to avoid over-updating
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run3 = "run_3_bipedal_ppo"
bipedal_run_dir_run3 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run3)
print(f"BipedalWalker PPO run 3 dir: {bipedal_run_dir_run3}")

# Training budget
bipedal_total_steps_run3 = 1_000_000
bipedal_rollout_len_run3 = 2048

# Train PPO
bipedal_model_run3, bipedal_episode_returns_run3, bipedal_logs_run3 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run3,
    control_type="continuous",
    run_dir=bipedal_run_dir_run3,
    total_env_steps=bipedal_total_steps_run3,
    rollout_len=bipedal_rollout_len_run3,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run3,
    lr=1.0e-4,          # smaller learning rate than run 2
    log_every=20_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run3, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run3, dtype=np.float32),
)

bipedal_model_path_run3 = os.path.join(bipedal_run_dir_run3, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run3.state_dict(), bipedal_model_path_run3)
print(f"Saved BipedalWalker PPO run 3 model to {bipedal_model_path_run3}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run3,
    run_dir=bipedal_run_dir_run3,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 3)",
    ma_window=20,
)

# Greedy evaluation with CSV logging
csv_path_bipedal_run3 = os.path.join(bipedal_run_dir_run3, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run3 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run3,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run3,
)

# Save eval .npy
np.save(
    os.path.join(bipedal_run_dir_run3, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run3, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run3,
    run_dir=bipedal_run_dir_run3,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 3)",
    ma_window=3,
)

# Record video of the best greedy episode
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run3,
    control_type="continuous",
    run_dir=bipedal_run_dir_run3,
    csv_path=csv_path_bipedal_run3,
    max_steps=1600,
)
BipedalWalker PPO run 3 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_3_bipedal_ppo
[PPO] it=    1 steps=    2048 avg10=-103.62 loss=163.674 pg=-0.002 vf=327.385 H=5.677 KL=0.0046 clip_frac=0.049
[PPO] it=   11 steps=   22528 avg10=-108.12 loss=87.907 pg=-0.001 vf=175.851 H=5.659 KL=0.0045 clip_frac=0.040
[PPO] it=   21 steps=   43008 avg10=-109.38 loss=35.685 pg=0.001 vf=71.402 H=5.633 KL=0.0054 clip_frac=0.054
[PPO] it=   31 steps=   63488 avg10=-100.87 loss=0.065 pg=0.000 vf=0.163 H=5.614 KL=0.0046 clip_frac=0.075
[PPO] it=   41 steps=   83968 avg10=-101.56 loss=0.064 pg=-0.001 vf=0.165 H=5.599 KL=0.0053 clip_frac=0.049
[PPO] it=   51 steps=  104448 avg10= -90.48 loss=0.054 pg=-0.004 vf=0.150 H=5.596 KL=0.0059 clip_frac=0.092
[PPO] it=   61 steps=  124928 avg10= -93.58 loss=55.759 pg=0.013 vf=111.525 H=5.584 KL=0.0087 clip_frac=0.198
[PPO] it=   71 steps=  145408 avg10= -91.71 loss=0.831 pg=0.002 vf=1.690 H=5.560 KL=0.0077 clip_frac=0.178
[PPO] it=   81 steps=  165888 avg10= -89.49 loss=0.343 pg=0.001 vf=0.718 H=5.572 KL=0.0045 clip_frac=0.042
[PPO] it=   91 steps=  186368 avg10= -95.37 loss=0.548 pg=0.000 vf=1.130 H=5.571 KL=0.0054 clip_frac=0.125
[PPO] it=  101 steps=  206848 avg10= -87.23 loss=0.079 pg=0.000 vf=0.190 H=5.571 KL=0.0033 clip_frac=0.015
[PPO] it=  111 steps=  227328 avg10= -89.01 loss=0.060 pg=-0.000 vf=0.153 H=5.555 KL=0.0041 clip_frac=0.051
[PPO] it=  121 steps=  247808 avg10= -80.83 loss=28.245 pg=0.016 vf=56.491 H=5.545 KL=0.0122 clip_frac=0.227
[PPO] it=  131 steps=  268288 avg10= -92.12 loss=19.088 pg=0.002 vf=38.205 H=5.543 KL=0.0111 clip_frac=0.222
[PPO] it=  141 steps=  288768 avg10= -81.45 loss=0.112 pg=-0.002 vf=0.261 H=5.536 KL=0.0044 clip_frac=0.047
[PPO] it=  151 steps=  309248 avg10=-125.75 loss=7.629 pg=0.004 vf=15.283 H=5.536 KL=0.0059 clip_frac=0.063
[PPO] it=  161 steps=  329728 avg10=-128.22 loss=1.191 pg=-0.001 vf=2.418 H=5.540 KL=0.0059 clip_frac=0.070
[PPO] it=  171 steps=  350208 avg10=-118.82 loss=5.511 pg=0.012 vf=11.032 H=5.535 KL=0.0121 clip_frac=0.212
[PPO] it=  181 steps=  370688 avg10=-109.60 loss=0.305 pg=0.000 vf=0.644 H=5.555 KL=0.0048 clip_frac=0.048
[PPO] it=  191 steps=  391168 avg10= -79.87 loss=0.175 pg=0.001 vf=0.381 H=5.541 KL=0.0048 clip_frac=0.025
[PPO] it=  201 steps=  411648 avg10=-117.27 loss=38.686 pg=0.002 vf=77.400 H=5.528 KL=0.0073 clip_frac=0.124
[PPO] it=  211 steps=  432128 avg10=-125.28 loss=55.293 pg=0.002 vf=110.616 H=5.528 KL=0.0057 clip_frac=0.081
[PPO] it=  221 steps=  452608 avg10=-144.21 loss=9.123 pg=0.002 vf=18.275 H=5.526 KL=0.0054 clip_frac=0.046
[PPO] it=  231 steps=  473088 avg10=-158.92 loss=42.913 pg=-0.000 vf=85.860 H=5.525 KL=0.0051 clip_frac=0.074
[PPO] it=  241 steps=  493568 avg10=-159.07 loss=2.050 pg=0.003 vf=4.128 H=5.527 KL=0.0074 clip_frac=0.093
[PPO] it=  251 steps=  514048 avg10=-142.35 loss=45.366 pg=0.001 vf=90.763 H=5.518 KL=0.0046 clip_frac=0.038
[PPO] it=  261 steps=  534528 avg10=-157.50 loss=39.039 pg=0.018 vf=78.075 H=5.517 KL=0.0270 clip_frac=0.251
[PPO] it=  271 steps=  555008 avg10=-158.52 loss=27.621 pg=0.003 vf=55.267 H=5.519 KL=0.0074 clip_frac=0.154
[PPO] it=  281 steps=  575488 avg10=-146.14 loss=20.192 pg=0.001 vf=40.416 H=5.528 KL=0.0073 clip_frac=0.081
[PPO] it=  291 steps=  595968 avg10=-164.49 loss=37.557 pg=0.008 vf=75.131 H=5.531 KL=0.0111 clip_frac=0.240
[PPO] it=  301 steps=  616448 avg10=-151.16 loss=0.100 pg=0.001 vf=0.232 H=5.528 KL=0.0056 clip_frac=0.072
[PPO] it=  311 steps=  636928 avg10=-137.17 loss=45.724 pg=0.007 vf=91.465 H=5.512 KL=0.0101 clip_frac=0.210
[PPO] it=  321 steps=  657408 avg10=-122.86 loss=18.384 pg=0.001 vf=36.800 H=5.521 KL=0.0044 clip_frac=0.049
[PPO] it=  331 steps=  677888 avg10= -99.06 loss=10.277 pg=0.003 vf=20.581 H=5.516 KL=0.0070 clip_frac=0.078
[PPO] it=  341 steps=  698368 avg10= -97.26 loss=0.952 pg=0.000 vf=1.936 H=5.515 KL=0.0064 clip_frac=0.102
[PPO] it=  351 steps=  718848 avg10=-129.39 loss=30.415 pg=0.001 vf=60.861 H=5.501 KL=0.0053 clip_frac=0.048
[PPO] it=  361 steps=  739328 avg10=-141.96 loss=50.073 pg=0.006 vf=100.167 H=5.497 KL=0.0066 clip_frac=0.101
[PPO] it=  371 steps=  759808 avg10= -96.79 loss=0.035 pg=-0.000 vf=0.103 H=5.486 KL=0.0055 clip_frac=0.067
[PPO] it=  381 steps=  780288 avg10= -99.85 loss=0.634 pg=0.000 vf=1.299 H=5.486 KL=0.0040 clip_frac=0.024
[PPO] it=  391 steps=  800768 avg10= -99.06 loss=17.625 pg=0.005 vf=35.273 H=5.471 KL=0.0068 clip_frac=0.075
[PPO] it=  401 steps=  821248 avg10= -93.89 loss=13.723 pg=0.003 vf=27.474 H=5.468 KL=0.0066 clip_frac=0.069
[PPO] it=  411 steps=  841728 avg10= -89.34 loss=0.429 pg=0.003 vf=0.886 H=5.476 KL=0.0040 clip_frac=0.039
[PPO] it=  421 steps=  862208 avg10= -89.89 loss=0.140 pg=-0.000 vf=0.314 H=5.460 KL=0.0047 clip_frac=0.043
[PPO] it=  431 steps=  882688 avg10= -85.14 loss=0.170 pg=0.001 vf=0.371 H=5.453 KL=0.0069 clip_frac=0.076
[PPO] it=  441 steps=  903168 avg10=-107.68 loss=3.266 pg=0.001 vf=6.562 H=5.452 KL=0.0080 clip_frac=0.064
[PPO] it=  451 steps=  923648 avg10= -82.90 loss=0.101 pg=-0.000 vf=0.235 H=5.442 KL=0.0055 clip_frac=0.059
[PPO] it=  461 steps=  944128 avg10=-109.09 loss=16.912 pg=0.003 vf=33.852 H=5.443 KL=0.0065 clip_frac=0.058
[PPO] it=  471 steps=  964608 avg10= -76.99 loss=0.056 pg=-0.003 vf=0.151 H=5.424 KL=0.0059 clip_frac=0.090
[PPO] it=  481 steps=  985088 avg10= -97.48 loss=13.074 pg=0.006 vf=26.168 H=5.427 KL=0.0071 clip_frac=0.176
[PPO] it=  489 steps= 1000000 avg10= -75.67 loss=0.299 pg=-0.001 vf=0.632 H=5.419 KL=0.0049 clip_frac=0.078
[PPO] done steps=1000000 time=1481.7s avg10=-75.67
Saved BipedalWalker PPO run 3 model to a3_bonus_ppo_artifacts/bipedal_walker/run_3_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_3_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return -91.52 steps 1600
Eval episode 2 seed 1228 return -93.38 steps 1600
Eval episode 3 seed 1229 return -93.91 steps 1600
Eval episode 4 seed 1230 return -93.43 steps 1600
Eval episode 5 seed 1231 return -94.29 steps 1600
Eval episode 6 seed 1232 return -93.78 steps 1600
Eval episode 7 seed 1233 return -94.85 steps 1600
Eval episode 8 seed 1234 return -93.94 steps 1600
Eval episode 9 seed 1235 return -94.17 steps 1600
Eval episode 10 seed 1236 return -94.32 steps 1600
Greedy evaluation mean -93.76  std 0.85
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_3_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_3_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=1, seed=1227, return=-91.52, steps=1600
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_3_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return -91.52 steps 1600 with seed 1227 into a3_bonus_ppo_artifacts/bipedal_walker/run_3_bipedal_ppo/videos
Replayed best episode for video: return=-91.52, steps=1600

Run#4

In [ ]:
# reuse obs_dim_bipedal, act_dim_bipedal, and bipedal_cfg from above

# fresh model for run 4
bipedal_model_run4 = build_ppo_continuous_model_from_config(bipedal_cfg).to(device)

# PPO hyperparameters for BipedalWalker – between run 2 and 3
bipedal_ppo_cfg_run4 = PPOUpdateConfig(
    clip_range=0.12,     # gentle updates (smaller than 0.15, larger than 0.10)
    value_coef=0.5,
    entropy_coef=0.004,  # small exploration noise
    max_grad_norm=0.5,
    n_epochs=4,          # moderate number of epochs
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run4 = "run_4_bipedal_ppo"
bipedal_run_dir_run4 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run4)
print(f"BipedalWalker PPO run 4 dir: {bipedal_run_dir_run4}")

# Training budget
bipedal_total_steps_run4 = 700_000
bipedal_rollout_len_run4 = 2048

# Train PPO
bipedal_model_run4, bipedal_episode_returns_run4, bipedal_logs_run4 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run4,
    control_type="continuous",
    run_dir=bipedal_run_dir_run4,
    total_env_steps=bipedal_total_steps_run4,
    rollout_len=bipedal_rollout_len_run4,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run4,
    lr=1.5e-4,
    log_every=20_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run4, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run4, dtype=np.float32),
)

bipedal_model_path_run4 = os.path.join(bipedal_run_dir_run4, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run4.state_dict(), bipedal_model_path_run4)
print(f"Saved BipedalWalker PPO run 4 model to {bipedal_model_path_run4}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run4,
    run_dir=bipedal_run_dir_run4,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 4)",
    ma_window=20,
)

# Greedy evaluation with CSV logging
csv_path_bipedal_run4 = os.path.join(bipedal_run_dir_run4, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run4 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run4,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run4,
)

# Save eval .npy
np.save(
    os.path.join(bipedal_run_dir_run4, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run4, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run4,
    run_dir=bipedal_run_dir_run4,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 4)",
    ma_window=3,
)

# Record video of the best greedy episode
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run4,
    control_type="continuous",
    run_dir=bipedal_run_dir_run4,
    csv_path=csv_path_bipedal_run4,
    max_steps=1600,
)
BipedalWalker PPO run 4 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_4_bipedal_ppo
[PPO] it=    1 steps=    2048 avg10=-102.77 loss=63.312 pg=-0.003 vf=126.675 H=5.682 KL=0.0057 clip_frac=0.075
[PPO] it=   11 steps=   22528 avg10=-103.34 loss=0.204 pg=0.002 vf=0.449 H=5.670 KL=0.0081 clip_frac=0.162
[PPO] it=   21 steps=   43008 avg10= -91.35 loss=0.059 pg=-0.002 vf=0.167 H=5.631 KL=0.0077 clip_frac=0.116
[PPO] it=   31 steps=   63488 avg10=-111.24 loss=72.785 pg=0.016 vf=145.583 H=5.619 KL=0.0154 clip_frac=0.288
[PPO] it=   41 steps=   83968 avg10= -91.08 loss=0.104 pg=-0.000 vf=0.254 H=5.608 KL=0.0070 clip_frac=0.083
[PPO] it=   51 steps=  104448 avg10= -91.75 loss=0.106 pg=0.000 vf=0.257 H=5.592 KL=0.0063 clip_frac=0.072
[PPO] it=   61 steps=  124928 avg10= -86.32 loss=0.170 pg=-0.001 vf=0.385 H=5.583 KL=0.0049 clip_frac=0.058
[PPO] it=   71 steps=  145408 avg10= -80.89 loss=0.581 pg=-0.001 vf=1.208 H=5.522 KL=0.0060 clip_frac=0.041
[PPO] it=   81 steps=  165888 avg10= -74.40 loss=0.065 pg=-0.002 vf=0.177 H=5.487 KL=0.0063 clip_frac=0.068
[PPO] it=   91 steps=  186368 avg10= -85.32 loss=0.477 pg=0.002 vf=0.994 H=5.475 KL=0.0074 clip_frac=0.084
[PPO] it=  101 steps=  206848 avg10= -88.14 loss=21.542 pg=0.006 vf=43.114 H=5.475 KL=0.0138 clip_frac=0.190
[PPO] it=  111 steps=  227328 avg10= -74.17 loss=11.550 pg=0.006 vf=23.131 H=5.475 KL=0.0121 clip_frac=0.116
[PPO] it=  121 steps=  247808 avg10= -78.70 loss=0.110 pg=-0.003 vf=0.268 H=5.432 KL=0.0071 clip_frac=0.082
[PPO] it=  131 steps=  268288 avg10= -71.62 loss=0.370 pg=-0.001 vf=0.786 H=5.428 KL=0.0063 clip_frac=0.060
[PPO] it=  141 steps=  288768 avg10= -60.59 loss=0.275 pg=-0.000 vf=0.593 H=5.397 KL=0.0061 clip_frac=0.064
[PPO] it=  151 steps=  309248 avg10= -39.22 loss=0.189 pg=-0.002 vf=0.425 H=5.405 KL=0.0075 clip_frac=0.068
[PPO] it=  161 steps=  329728 avg10= -25.88 loss=1.001 pg=-0.002 vf=2.047 H=5.396 KL=0.0055 clip_frac=0.063
[PPO] it=  171 steps=  350208 avg10=  -4.93 loss=0.635 pg=-0.002 vf=1.318 H=5.400 KL=0.0078 clip_frac=0.081
[PPO] it=  181 steps=  370688 avg10=  54.13 loss=0.841 pg=-0.002 vf=1.729 H=5.362 KL=0.0077 clip_frac=0.087
[PPO] it=  191 steps=  391168 avg10=  56.49 loss=0.277 pg=-0.002 vf=0.601 H=5.337 KL=0.0079 clip_frac=0.088
[PPO] it=  201 steps=  411648 avg10= 103.91 loss=0.855 pg=-0.002 vf=1.757 H=5.308 KL=0.0072 clip_frac=0.099
[PPO] it=  211 steps=  432128 avg10= 116.55 loss=0.400 pg=-0.002 vf=0.846 H=5.244 KL=0.0075 clip_frac=0.108
[PPO] it=  221 steps=  452608 avg10= 116.88 loss=0.512 pg=-0.001 vf=1.068 H=5.208 KL=0.0077 clip_frac=0.079
[PPO] it=  231 steps=  473088 avg10= 125.05 loss=0.425 pg=-0.001 vf=0.894 H=5.196 KL=0.0071 clip_frac=0.104
[PPO] it=  241 steps=  493568 avg10=  82.96 loss=1.581 pg=0.014 vf=3.176 H=5.184 KL=0.0157 clip_frac=0.312
[PPO] it=  251 steps=  514048 avg10=-123.27 loss=17.752 pg=0.020 vf=35.506 H=5.182 KL=0.0340 clip_frac=0.436
[PPO] it=  261 steps=  534528 avg10=-116.05 loss=15.919 pg=0.033 vf=31.814 H=5.184 KL=0.0481 clip_frac=0.563
[PPO] it=  271 steps=  555008 avg10=-118.84 loss=17.938 pg=0.018 vf=35.881 H=5.189 KL=0.0293 clip_frac=0.426
[PPO] it=  281 steps=  575488 avg10=-116.19 loss=9.962 pg=0.021 vf=19.924 H=5.208 KL=0.0275 clip_frac=0.464
[PPO] it=  291 steps=  595968 avg10=-117.48 loss=22.402 pg=0.023 vf=44.800 H=5.220 KL=0.0306 clip_frac=0.472
[PPO] it=  301 steps=  616448 avg10=-116.58 loss=13.080 pg=0.022 vf=26.157 H=5.241 KL=0.0258 clip_frac=0.425
[PPO] it=  311 steps=  636928 avg10=-115.78 loss=15.647 pg=0.013 vf=31.310 H=5.268 KL=0.0215 clip_frac=0.388
[PPO] it=  321 steps=  657408 avg10=-118.99 loss=12.884 pg=0.008 vf=25.794 H=5.286 KL=0.0223 clip_frac=0.327
[PPO] it=  331 steps=  677888 avg10=-118.88 loss=13.296 pg=0.006 vf=26.624 H=5.303 KL=0.0208 clip_frac=0.350
[PPO] it=  341 steps=  698368 avg10=-115.81 loss=11.992 pg=0.033 vf=23.960 H=5.312 KL=0.0325 clip_frac=0.545
[PPO] it=  342 steps=  700000 avg10=-117.34 loss=5.591 pg=0.016 vf=11.193 H=5.314 KL=0.0203 clip_frac=0.358
[PPO] done steps=700000 time=1108.0s avg10=-117.34
Saved BipedalWalker PPO run 4 model to a3_bonus_ppo_artifacts/bipedal_walker/run_4_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_4_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return -118.84 steps 166
Eval episode 2 seed 1228 return -118.87 steps 165
Eval episode 3 seed 1229 return -119.06 steps 167
Eval episode 4 seed 1230 return -118.53 steps 162
Eval episode 5 seed 1231 return -119.08 steps 167
Eval episode 6 seed 1232 return -118.76 steps 164
Eval episode 7 seed 1233 return -118.86 steps 165
Eval episode 8 seed 1234 return -118.92 steps 166
Eval episode 9 seed 1235 return -118.63 steps 163
Eval episode 10 seed 1236 return -118.76 steps 164
Greedy evaluation mean -118.83  std 0.16
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_4_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_4_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=4, seed=1230, return=-118.53, steps=162
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_4_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return -118.53 steps 162 with seed 1230 into a3_bonus_ppo_artifacts/bipedal_walker/run_4_bipedal_ppo/videos
Replayed best episode for video: return=-118.53, steps=162

Run#5

In [ ]:
# reuse obs_dim_bipedal, act_dim_bipedal, and bipedal_cfg from above

# fresh model for run 5
bipedal_model_run5 = build_ppo_continuous_model_from_config(bipedal_cfg).to(device)

# PPO hyperparameters for BipedalWalker – tuned between best parts of run 2 and 3/4
bipedal_ppo_cfg_run5 = PPOUpdateConfig(
    clip_range=0.15,     # moderate clip (like run 2, smaller than run 1)
    value_coef=0.5,
    entropy_coef=0.006,  # small but non-zero exploration
    max_grad_norm=0.5,
    n_epochs=4,          # a bit fewer epochs than run 2
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run5 = "run_5_bipedal_ppo"
bipedal_run_dir_run5 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run5)
print(f"BipedalWalker PPO run 5 dir: {bipedal_run_dir_run5}")

# Training budget
bipedal_total_steps_run5 = 600_000
bipedal_rollout_len_run5 = 2048

# Train PPO
bipedal_model_run5, bipedal_episode_returns_run5, bipedal_logs_run5 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run5,
    control_type="continuous",
    run_dir=bipedal_run_dir_run5,
    total_env_steps=bipedal_total_steps_run5,
    rollout_len=bipedal_rollout_len_run5,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run5,
    lr=2.0e-4,
    log_every=20_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run5, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run5, dtype=np.float32),
)

bipedal_model_path_run5 = os.path.join(bipedal_run_dir_run5, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run5.state_dict(), bipedal_model_path_run5)
print(f"Saved BipedalWalker PPO run 5 model to {bipedal_model_path_run5}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run5,
    run_dir=bipedal_run_dir_run5,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 5)",
    ma_window=20,
)

# Greedy evaluation with CSV logging
csv_path_bipedal_run5 = os.path.join(bipedal_run_dir_run5, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run5 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run5,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run5,
)

# Save eval .npy
np.save(
    os.path.join(bipedal_run_dir_run5, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run5, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run5,
    run_dir=bipedal_run_dir_run5,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 5)",
    ma_window=3,
)

# Record video of the best greedy episode
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run5,
    control_type="continuous",
    run_dir=bipedal_run_dir_run5,
    csv_path=csv_path_bipedal_run5,
    max_steps=1600,
)
BipedalWalker PPO run 5 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_5_bipedal_ppo
[PPO] it=    1 steps=    2048 avg10=-138.33 loss=182.159 pg=-0.003 vf=364.392 H=5.684 KL=0.0080 clip_frac=0.064
[PPO] it=   11 steps=   22528 avg10=-123.46 loss=3.166 pg=0.005 vf=6.391 H=5.687 KL=0.0102 clip_frac=0.157
[PPO] it=   21 steps=   43008 avg10= -95.78 loss=0.100 pg=0.002 vf=0.262 H=5.592 KL=0.0104 clip_frac=0.116
[PPO] it=   31 steps=   63488 avg10=-103.04 loss=0.851 pg=0.001 vf=1.768 H=5.593 KL=0.0097 clip_frac=0.096
[PPO] it=   41 steps=   83968 avg10= -93.50 loss=0.336 pg=0.002 vf=0.737 H=5.619 KL=0.0105 clip_frac=0.102
[PPO] it=   51 steps=  104448 avg10= -81.07 loss=1.161 pg=0.003 vf=2.384 H=5.601 KL=0.0080 clip_frac=0.078
[PPO] it=   61 steps=  124928 avg10= -92.23 loss=29.490 pg=-0.001 vf=59.049 H=5.612 KL=0.0066 clip_frac=0.052
[PPO] it=   71 steps=  145408 avg10= -92.02 loss=0.421 pg=0.002 vf=0.906 H=5.657 KL=0.0104 clip_frac=0.141
[PPO] it=   81 steps=  165888 avg10= -91.84 loss=13.995 pg=0.003 vf=28.052 H=5.644 KL=0.0057 clip_frac=0.022
[PPO] it=   91 steps=  186368 avg10= -76.08 loss=1.578 pg=0.002 vf=3.219 H=5.645 KL=0.0095 clip_frac=0.106
[PPO] it=  101 steps=  206848 avg10=-102.58 loss=22.998 pg=0.002 vf=46.060 H=5.641 KL=0.0098 clip_frac=0.133
[PPO] it=  111 steps=  227328 avg10= -79.23 loss=21.961 pg=0.003 vf=43.985 H=5.672 KL=0.0141 clip_frac=0.197
[PPO] it=  121 steps=  247808 avg10= -84.28 loss=0.357 pg=0.004 vf=0.775 H=5.709 KL=0.0162 clip_frac=0.215
[PPO] it=  131 steps=  268288 avg10=-112.54 loss=9.461 pg=0.002 vf=18.988 H=5.741 KL=0.0142 clip_frac=0.156
[PPO] it=  141 steps=  288768 avg10= -77.57 loss=10.679 pg=0.003 vf=21.421 H=5.807 KL=0.0125 clip_frac=0.123
[PPO] it=  151 steps=  309248 avg10= -87.99 loss=0.141 pg=-0.001 vf=0.353 H=5.769 KL=0.0124 clip_frac=0.177
[PPO] it=  161 steps=  329728 avg10= -78.35 loss=0.169 pg=0.004 vf=0.398 H=5.796 KL=0.0116 clip_frac=0.159
[PPO] it=  171 steps=  350208 avg10= -88.74 loss=0.322 pg=0.002 vf=0.709 H=5.780 KL=0.0108 clip_frac=0.146
[PPO] it=  181 steps=  370688 avg10= -59.63 loss=0.179 pg=0.001 vf=0.427 H=5.812 KL=0.0106 clip_frac=0.126
[PPO] it=  191 steps=  391168 avg10= -74.88 loss=12.885 pg=0.004 vf=25.832 H=5.827 KL=0.0107 clip_frac=0.098
[PPO] it=  201 steps=  411648 avg10=-114.14 loss=52.618 pg=0.003 vf=105.300 H=5.823 KL=0.0091 clip_frac=0.072
[PPO] it=  211 steps=  432128 avg10= -96.30 loss=0.184 pg=0.001 vf=0.435 H=5.807 KL=0.0119 clip_frac=0.164
[PPO] it=  221 steps=  452608 avg10= -51.89 loss=0.185 pg=-0.003 vf=0.445 H=5.783 KL=0.0108 clip_frac=0.119
[PPO] it=  231 steps=  473088 avg10= -83.61 loss=10.956 pg=-0.001 vf=21.983 H=5.770 KL=0.0086 clip_frac=0.062
[PPO] it=  241 steps=  493568 avg10= -51.43 loss=0.562 pg=-0.001 vf=1.193 H=5.738 KL=0.0098 clip_frac=0.085
[PPO] it=  251 steps=  514048 avg10= -38.51 loss=0.370 pg=-0.000 vf=0.809 H=5.702 KL=0.0087 clip_frac=0.074
[PPO] it=  261 steps=  534528 avg10= -12.32 loss=0.660 pg=-0.001 vf=1.390 H=5.695 KL=0.0084 clip_frac=0.071
[PPO] it=  271 steps=  555008 avg10= -21.57 loss=1.300 pg=0.003 vf=2.663 H=5.673 KL=0.0099 clip_frac=0.091
[PPO] it=  281 steps=  575488 avg10=   6.82 loss=1.215 pg=0.002 vf=2.496 H=5.685 KL=0.0108 clip_frac=0.138
[PPO] it=  291 steps=  595968 avg10= -24.94 loss=18.505 pg=0.001 vf=37.076 H=5.684 KL=0.0210 clip_frac=0.089
[PPO] it=  293 steps=  600000 avg10= -22.32 loss=0.681 pg=0.003 vf=1.425 H=5.676 KL=0.0124 clip_frac=0.114
[PPO] done steps=600000 time=771.3s avg10=-22.32
Saved BipedalWalker PPO run 5 model to a3_bonus_ppo_artifacts/bipedal_walker/run_5_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_5_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return 68.78 steps 1600
Eval episode 2 seed 1228 return 69.23 steps 1600
Eval episode 3 seed 1229 return 64.21 steps 1600
Eval episode 4 seed 1230 return 81.36 steps 1600
Eval episode 5 seed 1231 return 74.66 steps 1600
Eval episode 6 seed 1232 return 61.47 steps 1600
Eval episode 7 seed 1233 return 72.36 steps 1600
Eval episode 8 seed 1234 return 73.17 steps 1600
Eval episode 9 seed 1235 return 76.55 steps 1600
Eval episode 10 seed 1236 return 81.41 steps 1600
Greedy evaluation mean 72.32  std 6.28
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_5_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_5_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=10, seed=1236, return=81.41, steps=1600
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_5_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 81.41 steps 1600 with seed 1236 into a3_bonus_ppo_artifacts/bipedal_walker/run_5_bipedal_ppo/videos
Replayed best episode for video: return=81.41, steps=1600

Run#6

In [ ]:
# reuse obs_dim_bipedal, act_dim_bipedal, and bipedal_cfg from above

# fresh model for run 6
bipedal_model_run6 = build_ppo_continuous_model_from_config(bipedal_cfg).to(device)

# PPO hyperparameters – between the “good” run 5 and the conservative runs
bipedal_ppo_cfg_run6 = PPOUpdateConfig(
    clip_range=0.12,     # a bit gentler than run 5
    value_coef=0.5,
    entropy_coef=0.006,  # keep some exploration noise
    max_grad_norm=0.5,
    n_epochs=3,          # fewer epochs than run 5 to avoid over-updating
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run6 = "run_6_bipedal_ppo"
bipedal_run_dir_run6 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run6)
print(f"BipedalWalker PPO run 6 dir: {bipedal_run_dir_run6}")

# Training budget – a bit longer than run 5 but not huge
bipedal_total_steps_run6 = 700_000
bipedal_rollout_len_run6 = 2048

# Train PPO
bipedal_model_run6, bipedal_episode_returns_run6, bipedal_logs_run6 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run6,
    control_type="continuous",
    run_dir=bipedal_run_dir_run6,
    total_env_steps=bipedal_total_steps_run6,
    rollout_len=bipedal_rollout_len_run6,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run6,
    lr=1.5e-4,          # slightly smaller LR than run 5
    log_every=20_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run6, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run6, dtype=np.float32),
)

bipedal_model_path_run6 = os.path.join(bipedal_run_dir_run6, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run6.state_dict(), bipedal_model_path_run6)
print(f"Saved BipedalWalker PPO run 6 model to {bipedal_model_path_run6}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run6,
    run_dir=bipedal_run_dir_run6,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 6)",
    ma_window=20,
)

# Greedy evaluation with CSV logging
csv_path_bipedal_run6 = os.path.join(bipedal_run_dir_run6, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run6 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run6,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run6,
)

# Save eval .npy
np.save(
    os.path.join(bipedal_run_dir_run6, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run6, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run6,
    run_dir=bipedal_run_dir_run6,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 6)",
    ma_window=3,
)

# Record video of the best greedy episode
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run6,
    control_type="continuous",
    run_dir=bipedal_run_dir_run6,
    csv_path=csv_path_bipedal_run6,
    max_steps=1600,
)
BipedalWalker PPO run 6 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_6_bipedal_ppo
[PPO] it=    1 steps=    2048 avg10=-107.10 loss=5.344 pg=-0.002 vf=10.760 H=5.674 KL=0.0060 clip_frac=0.046
[PPO] it=   11 steps=   22528 avg10=-107.90 loss=62.402 pg=0.002 vf=124.867 H=5.678 KL=0.0076 clip_frac=0.088
[PPO] it=   21 steps=   43008 avg10=-104.83 loss=52.413 pg=0.002 vf=104.890 H=5.679 KL=0.0072 clip_frac=0.096
[PPO] it=   31 steps=   63488 avg10=-110.83 loss=27.090 pg=0.005 vf=54.237 H=5.627 KL=0.0065 clip_frac=0.044
[PPO] it=   41 steps=   83968 avg10=-106.06 loss=0.362 pg=0.000 vf=0.791 H=5.595 KL=0.0064 clip_frac=0.065
[PPO] it=   51 steps=  104448 avg10= -95.95 loss=0.131 pg=0.000 vf=0.328 H=5.554 KL=0.0068 clip_frac=0.075
[PPO] it=   61 steps=  124928 avg10=-103.25 loss=0.413 pg=-0.002 vf=0.896 H=5.550 KL=0.0084 clip_frac=0.106
[PPO] it=   71 steps=  145408 avg10=-112.11 loss=0.076 pg=0.000 vf=0.217 H=5.518 KL=0.0085 clip_frac=0.098
[PPO] it=   81 steps=  165888 avg10=-108.52 loss=0.152 pg=-0.001 vf=0.371 H=5.492 KL=0.0050 clip_frac=0.054
[PPO] it=   91 steps=  186368 avg10= -83.96 loss=0.040 pg=-0.002 vf=0.148 H=5.478 KL=0.0053 clip_frac=0.065
[PPO] it=  101 steps=  206848 avg10=-119.20 loss=41.242 pg=0.001 vf=82.548 H=5.462 KL=0.0045 clip_frac=0.031
[PPO] it=  111 steps=  227328 avg10=-149.32 loss=0.106 pg=-0.000 vf=0.277 H=5.453 KL=0.0053 clip_frac=0.059
[PPO] it=  121 steps=  247808 avg10=-120.95 loss=27.141 pg=0.003 vf=54.342 H=5.465 KL=0.0101 clip_frac=0.196
[PPO] it=  131 steps=  268288 avg10=-120.25 loss=24.867 pg=0.010 vf=49.778 H=5.461 KL=0.0137 clip_frac=0.150
[PPO] it=  141 steps=  288768 avg10=-117.64 loss=1.138 pg=0.007 vf=2.329 H=5.435 KL=0.0098 clip_frac=0.147
[PPO] it=  151 steps=  309248 avg10=-148.36 loss=34.975 pg=0.003 vf=70.008 H=5.430 KL=0.0079 clip_frac=0.095
[PPO] it=  161 steps=  329728 avg10=-144.07 loss=39.769 pg=0.001 vf=79.601 H=5.435 KL=0.0078 clip_frac=0.095
[PPO] it=  171 steps=  350208 avg10=-151.05 loss=35.286 pg=-0.001 vf=70.640 H=5.433 KL=0.0070 clip_frac=0.066
[PPO] it=  181 steps=  370688 avg10=-108.05 loss=0.132 pg=0.002 vf=0.326 H=5.448 KL=0.0071 clip_frac=0.063
[PPO] it=  191 steps=  391168 avg10= -89.95 loss=0.081 pg=0.001 vf=0.226 H=5.450 KL=0.0047 clip_frac=0.024
[PPO] it=  201 steps=  411648 avg10= -84.38 loss=0.125 pg=-0.002 vf=0.319 H=5.471 KL=0.0061 clip_frac=0.055
[PPO] it=  211 steps=  432128 avg10= -79.81 loss=0.109 pg=0.001 vf=0.282 H=5.457 KL=0.0059 clip_frac=0.037
[PPO] it=  221 steps=  452608 avg10= -98.04 loss=16.417 pg=0.000 vf=32.898 H=5.456 KL=0.0065 clip_frac=0.052
[PPO] it=  231 steps=  473088 avg10= -68.00 loss=0.083 pg=-0.001 vf=0.233 H=5.420 KL=0.0055 clip_frac=0.061
[PPO] it=  241 steps=  493568 avg10= -57.18 loss=0.201 pg=-0.001 vf=0.469 H=5.399 KL=0.0057 clip_frac=0.042
[PPO] it=  251 steps=  514048 avg10= -48.98 loss=0.410 pg=0.001 vf=0.883 H=5.389 KL=0.0052 clip_frac=0.042
[PPO] it=  261 steps=  534528 avg10= -33.45 loss=0.153 pg=0.001 vf=0.369 H=5.366 KL=0.0052 clip_frac=0.042
[PPO] it=  271 steps=  555008 avg10= -32.09 loss=0.444 pg=0.001 vf=0.949 H=5.376 KL=0.0074 clip_frac=0.104
[PPO] it=  281 steps=  575488 avg10= -16.32 loss=0.347 pg=-0.000 vf=0.760 H=5.338 KL=0.0082 clip_frac=0.118
[PPO] it=  291 steps=  595968 avg10= -85.93 loss=26.058 pg=0.034 vf=52.111 H=5.343 KL=0.0349 clip_frac=0.246
[PPO] it=  301 steps=  616448 avg10=-156.03 loss=0.088 pg=0.008 vf=0.225 H=5.347 KL=0.0209 clip_frac=0.416
[PPO] it=  311 steps=  636928 avg10=-139.39 loss=11.262 pg=0.006 vf=22.577 H=5.346 KL=0.0152 clip_frac=0.250
[PPO] it=  321 steps=  657408 avg10=-157.49 loss=0.181 pg=0.009 vf=0.409 H=5.379 KL=0.0177 clip_frac=0.356
[PPO] it=  331 steps=  677888 avg10=-166.22 loss=0.022 pg=0.009 vf=0.090 H=5.407 KL=0.0167 clip_frac=0.352
[PPO] it=  341 steps=  698368 avg10=-156.82 loss=25.145 pg=0.033 vf=50.291 H=5.447 KL=0.0192 clip_frac=0.288
[PPO] it=  342 steps=  700000 avg10=-156.72 loss=0.695 pg=0.007 vf=1.441 H=5.450 KL=0.0132 clip_frac=0.240
[PPO] done steps=700000 time=1032.9s avg10=-156.72
Saved BipedalWalker PPO run 6 model to a3_bonus_ppo_artifacts/bipedal_walker/run_6_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_6_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return -166.01 steps 1600
Eval episode 2 seed 1228 return -166.60 steps 1600
Eval episode 3 seed 1229 return -166.56 steps 1600
Eval episode 4 seed 1230 return -166.58 steps 1600
Eval episode 5 seed 1231 return -166.46 steps 1600
Eval episode 6 seed 1232 return -163.88 steps 1600
Eval episode 7 seed 1233 return -166.56 steps 1600
Eval episode 8 seed 1234 return -166.14 steps 1600
Eval episode 9 seed 1235 return -166.52 steps 1600
Eval episode 10 seed 1236 return -166.35 steps 1600
Greedy evaluation mean -166.17  std 0.78
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_6_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_6_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=6, seed=1232, return=-163.88, steps=1600
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_6_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return -163.88 steps 1600 with seed 1232 into a3_bonus_ppo_artifacts/bipedal_walker/run_6_bipedal_ppo/videos
Replayed best episode for video: return=-163.88, steps=1600

Run#7

In [ ]:
# reuse obs_dim_bipedal, act_dim_bipedal, and bipedal_cfg from above

# fresh model for run 7
bipedal_model_run7 = build_ppo_continuous_model_from_config(bipedal_cfg).to(device)

# PPO hyperparameters – same shape as run 5 (the best so far)
bipedal_ppo_cfg_run7 = PPOUpdateConfig(
    clip_range=0.15,     # moderate clipping
    value_coef=0.5,
    entropy_coef=0.006,  # small exploration noise
    max_grad_norm=0.5,
    n_epochs=4,
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run7 = "run_7_bipedal_ppo"
bipedal_run_dir_run7 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run7)
print(f"BipedalWalker PPO run 7 dir: {bipedal_run_dir_run7}")

# Training budget – a bit shorter to reduce late drift
bipedal_total_steps_run7 = 500_000
bipedal_rollout_len_run7 = 2048

# Train PPO
bipedal_model_run7, bipedal_episode_returns_run7, bipedal_logs_run7 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run7,
    control_type="continuous",
    run_dir=bipedal_run_dir_run7,
    total_env_steps=bipedal_total_steps_run7,
    rollout_len=bipedal_rollout_len_run7,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run7,
    lr=2.0e-4,
    log_every=20_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run7, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run7, dtype=np.float32),
)

bipedal_model_path_run7 = os.path.join(bipedal_run_dir_run7, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run7.state_dict(), bipedal_model_path_run7)
print(f"Saved BipedalWalker PPO run 7 model to {bipedal_model_path_run7}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run7,
    run_dir=bipedal_run_dir_run7,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 7)",
    ma_window=20,
)

# Greedy evaluation with CSV logging
csv_path_bipedal_run7 = os.path.join(bipedal_run_dir_run7, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run7 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run7,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run7,
)

# Save eval .npy
np.save(
    os.path.join(bipedal_run_dir_run7, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run7, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run7,
    run_dir=bipedal_run_dir_run7,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 7)",
    ma_window=3,
)

# Record video of the best greedy episode
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run7,
    control_type="continuous",
    run_dir=bipedal_run_dir_run7,
    csv_path=csv_path_bipedal_run7,
    max_steps=1600,
)
BipedalWalker PPO run 7 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_7_bipedal_ppo
[PPO] it=    1 steps=    2048 avg10=-105.27 loss=39.787 pg=-0.003 vf=79.649 H=5.685 KL=0.0114 clip_frac=0.134
[PPO] it=   11 steps=   22528 avg10=-108.76 loss=46.372 pg=0.001 vf=92.810 H=5.683 KL=0.0092 clip_frac=0.100
[PPO] it=   21 steps=   43008 avg10= -97.00 loss=29.730 pg=0.010 vf=59.509 H=5.660 KL=0.0101 clip_frac=0.097
[PPO] it=   31 steps=   63488 avg10= -79.61 loss=0.991 pg=0.003 vf=2.044 H=5.647 KL=0.0090 clip_frac=0.128
[PPO] it=   41 steps=   83968 avg10= -77.20 loss=0.070 pg=-0.003 vf=0.213 H=5.606 KL=0.0095 clip_frac=0.122
[PPO] it=   51 steps=  104448 avg10= -74.99 loss=0.117 pg=-0.002 vf=0.304 H=5.621 KL=0.0079 clip_frac=0.083
[PPO] it=   61 steps=  124928 avg10= -69.30 loss=0.184 pg=-0.000 vf=0.435 H=5.613 KL=0.0076 clip_frac=0.067
[PPO] it=   71 steps=  145408 avg10= -57.73 loss=0.113 pg=-0.003 vf=0.299 H=5.575 KL=0.0093 clip_frac=0.079
[PPO] it=   81 steps=  165888 avg10= -29.86 loss=0.379 pg=0.000 vf=0.825 H=5.542 KL=0.0078 clip_frac=0.083
[PPO] it=   91 steps=  186368 avg10= -18.80 loss=0.249 pg=-0.003 vf=0.569 H=5.518 KL=0.0106 clip_frac=0.108
[PPO] it=  101 steps=  206848 avg10=  -1.62 loss=0.136 pg=-0.002 vf=0.341 H=5.465 KL=0.0095 clip_frac=0.102
[PPO] it=  111 steps=  227328 avg10=   9.00 loss=0.246 pg=-0.001 vf=0.559 H=5.387 KL=0.0086 clip_frac=0.097
[PPO] it=  121 steps=  247808 avg10=  37.16 loss=0.389 pg=-0.001 vf=0.844 H=5.384 KL=0.0101 clip_frac=0.111
[PPO] it=  131 steps=  268288 avg10=  39.54 loss=43.134 pg=0.030 vf=86.273 H=5.356 KL=1.0694 clip_frac=0.420
[PPO] it=  141 steps=  288768 avg10=  67.45 loss=39.831 pg=0.009 vf=79.708 H=5.318 KL=0.6173 clip_frac=0.361
[PPO] it=  151 steps=  309248 avg10=  71.01 loss=0.859 pg=-0.000 vf=1.782 H=5.331 KL=0.0114 clip_frac=0.140
[PPO] it=  161 steps=  329728 avg10=  99.50 loss=0.654 pg=0.014 vf=1.344 H=5.308 KL=0.0173 clip_frac=0.216
[PPO] it=  171 steps=  350208 avg10= 122.31 loss=0.394 pg=-0.001 vf=0.853 H=5.303 KL=0.0116 clip_frac=0.148
[PPO] it=  181 steps=  370688 avg10= 123.99 loss=0.666 pg=-0.000 vf=1.396 H=5.307 KL=0.0115 clip_frac=0.171
[PPO] it=  191 steps=  391168 avg10= 111.08 loss=10.968 pg=-0.016 vf=22.032 H=5.277 KL=1.6044 clip_frac=0.286
[PPO] it=  201 steps=  411648 avg10= 127.49 loss=0.461 pg=0.001 vf=0.984 H=5.266 KL=0.0149 clip_frac=0.200
[PPO] it=  211 steps=  432128 avg10=  96.64 loss=0.504 pg=0.012 vf=1.049 H=5.307 KL=0.0189 clip_frac=0.240
[PPO] it=  221 steps=  452608 avg10=  64.85 loss=14.418 pg=-0.005 vf=28.910 H=5.313 KL=6.8645 clip_frac=0.314
[PPO] it=  231 steps=  473088 avg10=  84.69 loss=1.413 pg=0.016 vf=2.858 H=5.309 KL=0.0303 clip_frac=0.241
[PPO] it=  241 steps=  493568 avg10=  80.68 loss=1.240 pg=0.002 vf=2.540 H=5.290 KL=0.0186 clip_frac=0.198
[PPO] it=  245 steps=  500000 avg10=  92.68 loss=0.760 pg=-0.001 vf=1.586 H=5.288 KL=0.0154 clip_frac=0.151
[PPO] done steps=500000 time=673.6s avg10=92.68
Saved BipedalWalker PPO run 7 model to a3_bonus_ppo_artifacts/bipedal_walker/run_7_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_7_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return -48.44 steps 965
Eval episode 2 seed 1228 return 126.08 steps 1600
Eval episode 3 seed 1229 return 75.18 steps 1600
Eval episode 4 seed 1230 return 80.63 steps 1600
Eval episode 5 seed 1231 return 92.48 steps 1600
Eval episode 6 seed 1232 return 108.26 steps 1600
Eval episode 7 seed 1233 return 82.77 steps 1600
Eval episode 8 seed 1234 return 102.91 steps 1600
Eval episode 9 seed 1235 return 85.10 steps 1600
Eval episode 10 seed 1236 return 47.62 steps 1600
Greedy evaluation mean 75.26  std 45.80
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_7_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_7_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=2, seed=1228, return=126.08, steps=1600
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_7_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 126.08 steps 1600 with seed 1228 into a3_bonus_ppo_artifacts/bipedal_walker/run_7_bipedal_ppo/videos
Replayed best episode for video: return=126.08, steps=1600

Run#8

In [ ]:
# reuse obs_dim_bipedal, act_dim_bipedal, and bipedal_cfg from above

# fresh model for run 8
bipedal_model_run8 = build_ppo_continuous_model_from_config(bipedal_cfg).to(device)

# PPO hyperparameters – slight tweak of run 7
bipedal_ppo_cfg_run8 = PPOUpdateConfig(
    clip_range=0.12,     # a bit tighter than 0.15 to avoid big policy jumps
    value_coef=0.5,
    entropy_coef=0.005,  # slightly less entropy; policy already explores enough
    max_grad_norm=0.5,
    n_epochs=4,
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run8 = "run_8_bipedal_ppo"
bipedal_run_dir_run8 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run8)
print(f"BipedalWalker PPO run 8 dir: {bipedal_run_dir_run8}")

# Training budget – keep similar to run 7
bipedal_total_steps_run8 = 700_000
bipedal_rollout_len_run8 = 2048

# Train PPO
bipedal_model_run8, bipedal_episode_returns_run8, bipedal_logs_run8 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run8,
    control_type="continuous",
    run_dir=bipedal_run_dir_run8,
    total_env_steps=bipedal_total_steps_run8,
    rollout_len=bipedal_rollout_len_run8,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run8,
    lr=1.5e-4,           # a bit smaller than 2e-4 for gentler updates
    log_every=20_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run8, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run8, dtype=np.float32),
)

bipedal_model_path_run8 = os.path.join(bipedal_run_dir_run8, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run8.state_dict(), bipedal_model_path_run8)
print(f"Saved BipedalWalker PPO run 8 model to {bipedal_model_path_run8}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run8,
    run_dir=bipedal_run_dir_run8,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 8)",
    ma_window=20,
)

# Greedy evaluation with CSV logging
csv_path_bipedal_run8 = os.path.join(bipedal_run_dir_run8, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run8 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run8,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run8,
)

# Save eval .npy
np.save(
    os.path.join(bipedal_run_dir_run8, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run8, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run8,
    run_dir=bipedal_run_dir_run8,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 8)",
    ma_window=3,
)

# Record video of the best greedy episode
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run8,
    control_type="continuous",
    run_dir=bipedal_run_dir_run8,
    csv_path=csv_path_bipedal_run8,
    max_steps=1600,
)
BipedalWalker PPO run 8 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_8_bipedal_ppo
[PPO] it=    1 steps=    2048 avg10=-104.71 loss=241.346 pg=-0.002 vf=482.751 H=5.674 KL=0.0050 clip_frac=0.046
[PPO] it=   11 steps=   22528 avg10=-110.53 loss=34.727 pg=0.005 vf=69.501 H=5.697 KL=0.0116 clip_frac=0.151
[PPO] it=   21 steps=   43008 avg10=-114.13 loss=0.918 pg=0.003 vf=1.887 H=5.756 KL=0.0110 clip_frac=0.218
[PPO] it=   31 steps=   63488 avg10=-107.48 loss=0.219 pg=0.003 vf=0.489 H=5.715 KL=0.0094 clip_frac=0.225
[PPO] it=   41 steps=   83968 avg10=-101.29 loss=1.354 pg=0.000 vf=2.764 H=5.746 KL=0.0066 clip_frac=0.090
[PPO] it=   51 steps=  104448 avg10= -99.99 loss=0.826 pg=-0.000 vf=1.709 H=5.762 KL=0.0068 clip_frac=0.106
[PPO] it=   61 steps=  124928 avg10= -94.61 loss=0.039 pg=-0.002 vf=0.139 H=5.744 KL=0.0078 clip_frac=0.110
[PPO] it=   71 steps=  145408 avg10= -90.63 loss=0.050 pg=-0.001 vf=0.160 H=5.723 KL=0.0065 clip_frac=0.081
[PPO] it=   81 steps=  165888 avg10= -81.85 loss=0.071 pg=-0.000 vf=0.199 H=5.675 KL=0.0076 clip_frac=0.075
[PPO] it=   91 steps=  186368 avg10= -75.29 loss=0.065 pg=-0.001 vf=0.188 H=5.630 KL=0.0075 clip_frac=0.105
[PPO] it=  101 steps=  206848 avg10= -75.71 loss=0.078 pg=-0.001 vf=0.215 H=5.620 KL=0.0080 clip_frac=0.090
[PPO] it=  111 steps=  227328 avg10= -84.68 loss=18.206 pg=0.011 vf=36.446 H=5.624 KL=0.0163 clip_frac=0.265
[PPO] it=  121 steps=  247808 avg10= -70.69 loss=0.154 pg=0.003 vf=0.359 H=5.601 KL=0.0079 clip_frac=0.120
[PPO] it=  131 steps=  268288 avg10= -76.84 loss=1.127 pg=0.004 vf=2.301 H=5.579 KL=0.0112 clip_frac=0.189
[PPO] it=  141 steps=  288768 avg10= -79.35 loss=0.238 pg=0.006 vf=0.521 H=5.585 KL=0.0103 clip_frac=0.197
[PPO] it=  151 steps=  309248 avg10= -58.29 loss=0.057 pg=-0.003 vf=0.175 H=5.567 KL=0.0071 clip_frac=0.107
[PPO] it=  161 steps=  329728 avg10= -73.70 loss=31.461 pg=0.005 vf=62.967 H=5.564 KL=0.0125 clip_frac=0.202
[PPO] it=  171 steps=  350208 avg10= -58.54 loss=0.533 pg=0.009 vf=1.104 H=5.556 KL=0.0114 clip_frac=0.182
[PPO] it=  181 steps=  370688 avg10= -76.38 loss=14.841 pg=-0.000 vf=29.739 H=5.554 KL=0.0068 clip_frac=0.077
[PPO] it=  191 steps=  391168 avg10= -62.63 loss=0.415 pg=-0.001 vf=0.888 H=5.538 KL=0.0069 clip_frac=0.079
[PPO] it=  201 steps=  411648 avg10= -40.85 loss=0.168 pg=0.001 vf=0.390 H=5.488 KL=0.0089 clip_frac=0.127
[PPO] it=  211 steps=  432128 avg10= -36.97 loss=0.473 pg=0.002 vf=0.996 H=5.467 KL=0.0089 clip_frac=0.163
[PPO] it=  221 steps=  452608 avg10= -28.53 loss=32.723 pg=0.006 vf=65.489 H=5.410 KL=0.0097 clip_frac=0.117
[PPO] it=  231 steps=  473088 avg10= -40.66 loss=0.269 pg=0.001 vf=0.589 H=5.385 KL=0.0088 clip_frac=0.127
[PPO] it=  241 steps=  493568 avg10= -92.37 loss=67.380 pg=0.013 vf=134.787 H=5.387 KL=0.0149 clip_frac=0.143
[PPO] it=  251 steps=  514048 avg10=-124.79 loss=6.502 pg=0.003 vf=13.053 H=5.385 KL=0.0102 clip_frac=0.182
[PPO] it=  261 steps=  534528 avg10=-124.43 loss=0.211 pg=0.007 vf=0.462 H=5.395 KL=0.0205 clip_frac=0.395
[PPO] it=  271 steps=  555008 avg10= -91.90 loss=39.632 pg=0.010 vf=79.298 H=5.408 KL=0.0129 clip_frac=0.181
[PPO] it=  281 steps=  575488 avg10=-111.53 loss=25.493 pg=0.025 vf=50.990 H=5.423 KL=0.0334 clip_frac=0.534
[PPO] it=  291 steps=  595968 avg10= -93.14 loss=15.272 pg=0.007 vf=30.584 H=5.421 KL=0.0123 clip_frac=0.158
[PPO] it=  301 steps=  616448 avg10= -13.74 loss=30.062 pg=0.001 vf=60.176 H=5.400 KL=0.0093 clip_frac=0.073
[PPO] it=  311 steps=  636928 avg10=  -8.76 loss=2.773 pg=0.006 vf=5.588 H=5.403 KL=0.0127 clip_frac=0.190
[PPO] it=  321 steps=  657408 avg10=  -3.52 loss=0.160 pg=-0.003 vf=0.379 H=5.360 KL=0.0091 clip_frac=0.146
[PPO] it=  331 steps=  677888 avg10= -65.27 loss=22.140 pg=0.025 vf=44.282 H=5.344 KL=0.0217 clip_frac=0.228
[PPO] it=  341 steps=  698368 avg10= -39.05 loss=17.419 pg=-0.000 vf=34.891 H=5.338 KL=0.0100 clip_frac=0.139
[PPO] it=  342 steps=  700000 avg10= -26.50 loss=1.096 pg=0.005 vf=2.235 H=5.335 KL=0.0104 clip_frac=0.199
[PPO] done steps=700000 time=1090.3s avg10=-26.50
Saved BipedalWalker PPO run 8 model to a3_bonus_ppo_artifacts/bipedal_walker/run_8_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_8_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return 89.07 steps 1600
Eval episode 2 seed 1228 return 87.25 steps 1600
Eval episode 3 seed 1229 return 88.68 steps 1600
Eval episode 4 seed 1230 return 83.45 steps 1600
Eval episode 5 seed 1231 return 90.37 steps 1600
Eval episode 6 seed 1232 return 73.60 steps 1600
Eval episode 7 seed 1233 return 88.16 steps 1600
Eval episode 8 seed 1234 return 91.63 steps 1600
Eval episode 9 seed 1235 return 83.54 steps 1600
Eval episode 10 seed 1236 return 83.77 steps 1600
Greedy evaluation mean 85.95  std 4.95
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_8_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_8_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=8, seed=1234, return=91.63, steps=1600
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_8_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 91.63 steps 1600 with seed 1234 into a3_bonus_ppo_artifacts/bipedal_walker/run_8_bipedal_ppo/videos
Replayed best episode for video: return=91.63, steps=1600

Run#9

In [ ]:
# fresh model for run 9 (reuse the same config defined earlier: bipedal_cfg)
bipedal_model_run9 = build_ppo_continuous_model_from_config(bipedal_cfg).to(device)

# PPO hyperparameters for BipedalWalker (more conservative than run 7)
bipedal_ppo_cfg_run9 = PPOUpdateConfig(
    clip_range=0.10,     # smaller update radius than 0.12
    value_coef=0.5,
    entropy_coef=0.006,  # a bit of exploration but not too much
    max_grad_norm=0.5,
    n_epochs=4,          # fewer passes over each batch to reduce overfitting / KL spikes
    batch_size=256,
    normalize_adv=True,
)

# run name / directory
bipedal_run_name_run9 = "run_9_bipedal_ppo"
bipedal_run_dir_run9 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run9)
print(f"BipedalWalker PPO run 9 dir: {bipedal_run_dir_run9}")

# training budget
bipedal_total_steps_run9 = 700_000
bipedal_rollout_len_run9 = 2048

# Train PPO
bipedal_model_run9, bipedal_episode_returns_run9, bipedal_logs_run9 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run9,
    control_type="continuous",
    run_dir=bipedal_run_dir_run9,
    total_env_steps=bipedal_total_steps_run9,
    rollout_len=bipedal_rollout_len_run9,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run9,
    lr=1e-4,             # more gentle LR
    log_every=20_000,
)

# save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run9, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run9, dtype=np.float32),
)

bipedal_model_path_run9 = os.path.join(bipedal_run_dir_run9, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run9.state_dict(), bipedal_model_path_run9)
print(f"Saved BipedalWalker PPO run 9 model to {bipedal_model_path_run9}")

# training curve
plot_rewards(
    rewards=bipedal_episode_returns_run9,
    run_dir=bipedal_run_dir_run9,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 9)",
    ma_window=20,
)

# Greedy evaluation with CSV logging
csv_path_bipedal_run9 = os.path.join(bipedal_run_dir_run9, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run9 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run9,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run9,
)

# save eval .npy
np.save(
    os.path.join(bipedal_run_dir_run9, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run9, dtype=np.float32),
)

# eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run9,
    run_dir=bipedal_run_dir_run9,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 9)",
    ma_window=3,
)

# Record video of the best greedy episode
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run9,
    control_type="continuous",
    run_dir=bipedal_run_dir_run9,
    csv_path=csv_path_bipedal_run9,
    max_steps=1600,
)
BipedalWalker PPO run 9 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_9_bipedal_ppo
[PPO] it=    1 steps=    2048 avg10=-103.46 loss=166.417 pg=-0.002 vf=332.904 H=5.674 KL=0.0012 clip_frac=0.001
[PPO] it=   11 steps=   22528 avg10=-118.63 loss=57.044 pg=0.001 vf=114.155 H=5.683 KL=0.0035 clip_frac=0.102
[PPO] it=   21 steps=   43008 avg10=-103.77 loss=112.183 pg=-0.001 vf=224.437 H=5.695 KL=0.0022 clip_frac=0.040
[PPO] it=   31 steps=   63488 avg10=-109.13 loss=159.643 pg=-0.000 vf=319.355 H=5.702 KL=0.0015 clip_frac=0.023
[PPO] it=   41 steps=   83968 avg10=-105.41 loss=83.249 pg=-0.001 vf=166.569 H=5.705 KL=0.0022 clip_frac=0.069
[PPO] it=   51 steps=  104448 avg10=-102.43 loss=34.525 pg=-0.000 vf=69.119 H=5.718 KL=0.0013 clip_frac=0.011
[PPO] it=   61 steps=  124928 avg10=-114.14 loss=285.361 pg=0.006 vf=570.780 H=5.734 KL=0.0073 clip_frac=0.260
[PPO] it=   71 steps=  145408 avg10= -96.68 loss=61.542 pg=0.000 vf=123.153 H=5.737 KL=0.0025 clip_frac=0.042
[PPO] it=   81 steps=  165888 avg10= -96.11 loss=20.473 pg=-0.000 vf=41.016 H=5.744 KL=0.0030 clip_frac=0.058
[PPO] it=   91 steps=  186368 avg10=-106.81 loss=28.712 pg=-0.001 vf=57.495 H=5.754 KL=0.0023 clip_frac=0.046
[PPO] it=  101 steps=  206848 avg10= -88.66 loss=34.780 pg=0.007 vf=69.616 H=5.764 KL=0.0049 clip_frac=0.229
[PPO] it=  111 steps=  227328 avg10= -93.71 loss=0.222 pg=-0.001 vf=0.515 H=5.739 KL=0.0026 clip_frac=0.050
[PPO] it=  121 steps=  247808 avg10= -92.95 loss=15.897 pg=0.014 vf=31.835 H=5.745 KL=0.0105 clip_frac=0.374
[PPO] it=  131 steps=  268288 avg10= -86.56 loss=0.152 pg=-0.000 vf=0.373 H=5.727 KL=0.0015 clip_frac=0.010
[PPO] it=  141 steps=  288768 avg10= -87.48 loss=0.099 pg=-0.002 vf=0.270 H=5.718 KL=0.0017 clip_frac=0.066
[PPO] it=  151 steps=  309248 avg10= -89.62 loss=0.213 pg=-0.001 vf=0.497 H=5.708 KL=0.0021 clip_frac=0.066
[PPO] it=  161 steps=  329728 avg10= -94.72 loss=0.182 pg=-0.001 vf=0.434 H=5.701 KL=0.0014 clip_frac=0.038
[PPO] it=  171 steps=  350208 avg10= -85.29 loss=0.162 pg=-0.000 vf=0.394 H=5.689 KL=0.0025 clip_frac=0.030
[PPO] it=  181 steps=  370688 avg10= -84.02 loss=0.196 pg=-0.001 vf=0.461 H=5.691 KL=0.0021 clip_frac=0.034
[PPO] it=  191 steps=  391168 avg10= -76.07 loss=0.258 pg=-0.001 vf=0.586 H=5.668 KL=0.0021 clip_frac=0.055
[PPO] it=  201 steps=  411648 avg10= -75.56 loss=0.159 pg=0.000 vf=0.386 H=5.659 KL=0.0028 clip_frac=0.037
[PPO] it=  211 steps=  432128 avg10= -87.64 loss=16.408 pg=0.012 vf=32.859 H=5.651 KL=0.0033 clip_frac=0.135
[PPO] it=  221 steps=  452608 avg10= -87.05 loss=1.162 pg=0.002 vf=2.387 H=5.644 KL=0.0034 clip_frac=0.121
[PPO] it=  231 steps=  473088 avg10= -73.47 loss=0.090 pg=-0.001 vf=0.250 H=5.636 KL=0.0020 clip_frac=0.045
[PPO] it=  241 steps=  493568 avg10= -76.41 loss=0.060 pg=-0.002 vf=0.191 H=5.630 KL=0.0030 clip_frac=0.042
[PPO] it=  251 steps=  514048 avg10= -72.88 loss=0.076 pg=-0.002 vf=0.224 H=5.627 KL=0.0036 clip_frac=0.065
[PPO] it=  261 steps=  534528 avg10= -72.32 loss=16.282 pg=0.005 vf=32.622 H=5.628 KL=0.0036 clip_frac=0.097
[PPO] it=  271 steps=  555008 avg10= -76.90 loss=0.071 pg=-0.000 vf=0.210 H=5.622 KL=0.0014 clip_frac=0.021
[PPO] it=  281 steps=  575488 avg10= -63.67 loss=0.134 pg=-0.003 vf=0.343 H=5.618 KL=0.0030 clip_frac=0.099
[PPO] it=  291 steps=  595968 avg10= -57.43 loss=0.176 pg=-0.002 vf=0.423 H=5.606 KL=0.0029 clip_frac=0.054
[PPO] it=  301 steps=  616448 avg10= -85.04 loss=18.636 pg=0.018 vf=37.304 H=5.609 KL=0.0207 clip_frac=0.459
[PPO] it=  311 steps=  636928 avg10= -56.25 loss=0.271 pg=-0.002 vf=0.612 H=5.615 KL=0.0025 clip_frac=0.070
[PPO] it=  321 steps=  657408 avg10= -47.52 loss=0.089 pg=-0.002 vf=0.248 H=5.606 KL=0.0021 clip_frac=0.044
[PPO] it=  331 steps=  677888 avg10= -35.05 loss=0.631 pg=-0.001 vf=1.330 H=5.594 KL=0.0019 clip_frac=0.050
[PPO] it=  341 steps=  698368 avg10= -26.39 loss=0.182 pg=-0.002 vf=0.436 H=5.590 KL=0.0021 clip_frac=0.040
[PPO] it=  342 steps=  700000 avg10= -26.95 loss=0.192 pg=0.003 vf=0.446 H=5.590 KL=0.0022 clip_frac=0.032
[PPO] done steps=700000 time=944.2s avg10=-26.95
Saved BipedalWalker PPO run 9 model to a3_bonus_ppo_artifacts/bipedal_walker/run_9_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_9_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return 39.83 steps 1600
Eval episode 2 seed 1228 return 47.57 steps 1600
Eval episode 3 seed 1229 return 48.89 steps 1600
Eval episode 4 seed 1230 return 40.58 steps 1600
Eval episode 5 seed 1231 return 43.79 steps 1600
Eval episode 6 seed 1232 return 53.63 steps 1600
Eval episode 7 seed 1233 return 50.38 steps 1600
Eval episode 8 seed 1234 return 42.42 steps 1600
Eval episode 9 seed 1235 return 31.71 steps 1600
Eval episode 10 seed 1236 return 42.40 steps 1600
Greedy evaluation mean 44.12  std 5.96
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_9_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_9_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=6, seed=1232, return=53.63, steps=1600
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_9_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 53.63 steps 1600 with seed 1232 into a3_bonus_ppo_artifacts/bipedal_walker/run_9_bipedal_ppo/videos
Replayed best episode for video: return=53.63, steps=1600

Run#10

In [ ]:
# Discover BipedalWalker dimensions
tmp_env = make_env(BIPEDAL_ENV_ID, worker_id=0, base_seed=SEED)
obs_dim_bipedal = tmp_env.observation_space.shape[0]
act_dim_bipedal = tmp_env.action_space.shape[0]
tmp_env.close()

print(f"BipedalWalker obs_dim={obs_dim_bipedal}, act_dim={act_dim_bipedal}")

# PPO model configuration (same as successful runs)
bipedal_cfg_run10 = PPOContinuousModelConfig(
    obs_dim=obs_dim_bipedal,
    act_dim=act_dim_bipedal,
    hidden_sizes=(256, 256),
)

bipedal_model_run10 = build_ppo_continuous_model_from_config(
    bipedal_cfg_run10
).to(device)

# PPO hyperparameters for BipedalWalker – "run 7 but gentler"
bipedal_ppo_cfg_run10 = PPOUpdateConfig(
    clip_range=0.12,     # was larger in run 7 → shrink to avoid destructive updates
    value_coef=0.5,
    entropy_coef=0.005,  # slightly less exploration than run 7 (0.01)
    max_grad_norm=0.5,
    n_epochs=4,
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run10 = "run_10_bipedal_ppo"
bipedal_run_dir_run10 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run10)
print(f"BipedalWalker PPO run 10 dir: {bipedal_run_dir_run10}")

# Training budget (similar scale to run 7)
bipedal_total_steps_run10 = 550_000
bipedal_rollout_len_run10 = 2048

# Train PPO on BipedalWalker
bipedal_model_run10, bipedal_episode_returns_run10, bipedal_logs_run10 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run10,
    control_type="continuous",
    run_dir=bipedal_run_dir_run10,
    total_env_steps=bipedal_total_steps_run10,
    rollout_len=bipedal_rollout_len_run10,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run10,
    lr=1e-4,             # smaller than run 7 for smoother late training
    log_every=10_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run10, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run10, dtype=np.float32),
)

bipedal_model_path_run10 = os.path.join(bipedal_run_dir_run10, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run10.state_dict(), bipedal_model_path_run10)
print(f"Saved BipedalWalker PPO run 10 model to {bipedal_model_path_run10}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run10,
    run_dir=bipedal_run_dir_run10,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 10)",
    ma_window=20,
)

# Greedy evaluation with CSV logging (10 episodes)
csv_path_bipedal_run10 = os.path.join(bipedal_run_dir_run10, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run10 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run10,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run10,
)

# Save eval .npy
np.save(
    os.path.join(bipedal_run_dir_run10, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run10, dtype=np.float32),
)

# Eval plot (same style as other envs)
plot_rewards(
    rewards=bipedal_eval_returns_run10,
    run_dir=bipedal_run_dir_run10,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 10)",
)

# Record video of the *best* greedy evaluation episode
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run10,
    control_type="continuous",
    run_dir=bipedal_run_dir_run10,
    csv_path=csv_path_bipedal_run10,
    max_steps=1600,
)
BipedalWalker obs_dim=24, act_dim=4
BipedalWalker PPO run 10 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_10_bipedal_ppo
[PPO] it=    1 steps=    2048 avg10=-122.44 loss=9.564 pg=-0.001 vf=19.186 H=5.676 KL=0.0056 clip_frac=0.053
[PPO] it=    6 steps=   12288 avg10=-107.59 loss=39.051 pg=0.002 vf=78.154 H=5.663 KL=0.0065 clip_frac=0.078
[PPO] it=   11 steps=   22528 avg10=-109.74 loss=29.249 pg=0.003 vf=58.548 H=5.663 KL=0.0064 clip_frac=0.077
[PPO] it=   16 steps=   32768 avg10=-109.35 loss=68.785 pg=0.002 vf=137.622 H=5.670 KL=0.0052 clip_frac=0.045
[PPO] it=   21 steps=   43008 avg10=-119.16 loss=18.634 pg=0.004 vf=37.318 H=5.674 KL=0.0073 clip_frac=0.079
[PPO] it=   26 steps=   53248 avg10=-108.16 loss=16.566 pg=0.006 vf=33.177 H=5.669 KL=0.0118 clip_frac=0.153
[PPO] it=   31 steps=   63488 avg10=-107.84 loss=0.067 pg=0.000 vf=0.191 H=5.661 KL=0.0075 clip_frac=0.082
[PPO] it=   36 steps=   73728 avg10=-106.44 loss=0.104 pg=-0.000 vf=0.265 H=5.652 KL=0.0068 clip_frac=0.071
[PPO] it=   41 steps=   83968 avg10= -92.16 loss=0.112 pg=-0.002 vf=0.285 H=5.627 KL=0.0071 clip_frac=0.078
[PPO] it=   46 steps=   94208 avg10= -77.70 loss=0.100 pg=0.002 vf=0.253 H=5.623 KL=0.0055 clip_frac=0.042
[PPO] it=   51 steps=  104448 avg10= -75.93 loss=0.080 pg=-0.002 vf=0.219 H=5.619 KL=0.0055 clip_frac=0.057
[PPO] it=   56 steps=  114688 avg10= -94.32 loss=57.640 pg=0.004 vf=115.329 H=5.617 KL=0.0079 clip_frac=0.111
[PPO] it=   61 steps=  124928 avg10=-109.90 loss=26.487 pg=0.001 vf=53.027 H=5.611 KL=0.0050 clip_frac=0.045
[PPO] it=   66 steps=  135168 avg10= -97.99 loss=1.330 pg=-0.000 vf=2.717 H=5.611 KL=0.0047 clip_frac=0.058
[PPO] it=   71 steps=  145408 avg10= -92.71 loss=9.571 pg=0.001 vf=19.196 H=5.616 KL=0.0045 clip_frac=0.026
[PPO] it=   76 steps=  155648 avg10= -81.41 loss=0.179 pg=-0.000 vf=0.415 H=5.592 KL=0.0052 clip_frac=0.072
[PPO] it=   81 steps=  165888 avg10= -76.71 loss=0.440 pg=-0.002 vf=0.940 H=5.573 KL=0.0057 clip_frac=0.070
[PPO] it=   86 steps=  176128 avg10= -75.00 loss=0.101 pg=-0.001 vf=0.261 H=5.553 KL=0.0050 clip_frac=0.036
[PPO] it=   91 steps=  186368 avg10= -65.24 loss=0.146 pg=-0.003 vf=0.354 H=5.529 KL=0.0070 clip_frac=0.067
[PPO] it=   96 steps=  196608 avg10=-112.14 loss=59.629 pg=-0.000 vf=119.314 H=5.526 KL=0.0052 clip_frac=0.046
[PPO] it=  101 steps=  206848 avg10=-113.44 loss=1.480 pg=-0.000 vf=3.016 H=5.521 KL=0.0070 clip_frac=0.088
[PPO] it=  106 steps=  217088 avg10=-125.16 loss=31.251 pg=0.005 vf=62.548 H=5.522 KL=0.0081 clip_frac=0.125
[PPO] it=  111 steps=  227328 avg10=-118.22 loss=77.304 pg=0.009 vf=154.645 H=5.517 KL=0.0142 clip_frac=0.230
[PPO] it=  116 steps=  237568 avg10=-103.91 loss=0.169 pg=-0.001 vf=0.396 H=5.513 KL=0.0079 clip_frac=0.127
[PPO] it=  121 steps=  247808 avg10= -72.94 loss=0.168 pg=-0.002 vf=0.395 H=5.490 KL=0.0061 clip_frac=0.065
[PPO] it=  126 steps=  258048 avg10= -89.87 loss=34.579 pg=-0.001 vf=69.215 H=5.479 KL=0.0052 clip_frac=0.022
[PPO] it=  131 steps=  268288 avg10= -82.82 loss=51.708 pg=0.006 vf=103.458 H=5.467 KL=0.0078 clip_frac=0.064
[PPO] it=  136 steps=  278528 avg10= -83.00 loss=0.291 pg=-0.002 vf=0.639 H=5.463 KL=0.0067 clip_frac=0.098
[PPO] it=  141 steps=  288768 avg10= -70.41 loss=0.390 pg=-0.000 vf=0.835 H=5.457 KL=0.0046 clip_frac=0.040
[PPO] it=  146 steps=  299008 avg10= -67.65 loss=0.163 pg=-0.002 vf=0.383 H=5.453 KL=0.0050 clip_frac=0.043
[PPO] it=  151 steps=  309248 avg10= -94.09 loss=0.353 pg=0.002 vf=0.756 H=5.444 KL=0.0064 clip_frac=0.054
[PPO] it=  156 steps=  319488 avg10= -74.04 loss=0.341 pg=-0.001 vf=0.737 H=5.443 KL=0.0051 clip_frac=0.022
[PPO] it=  161 steps=  329728 avg10= -64.94 loss=0.886 pg=-0.000 vf=1.827 H=5.431 KL=0.0058 clip_frac=0.076
[PPO] it=  166 steps=  339968 avg10= -61.43 loss=0.107 pg=-0.001 vf=0.270 H=5.424 KL=0.0063 clip_frac=0.054
[PPO] it=  171 steps=  350208 avg10= -52.24 loss=0.338 pg=-0.000 vf=0.730 H=5.416 KL=0.0070 clip_frac=0.060
[PPO] it=  176 steps=  360448 avg10= -61.93 loss=0.293 pg=-0.000 vf=0.639 H=5.408 KL=0.0040 clip_frac=0.024
[PPO] it=  181 steps=  370688 avg10= -53.22 loss=0.372 pg=-0.001 vf=0.801 H=5.388 KL=0.0069 clip_frac=0.065
[PPO] it=  186 steps=  380928 avg10= -45.87 loss=0.043 pg=-0.001 vf=0.142 H=5.367 KL=0.0058 clip_frac=0.056
[PPO] it=  191 steps=  391168 avg10= -41.68 loss=0.213 pg=-0.002 vf=0.483 H=5.361 KL=0.0051 clip_frac=0.047
[PPO] it=  196 steps=  401408 avg10= -33.24 loss=0.177 pg=-0.001 vf=0.410 H=5.333 KL=0.0061 clip_frac=0.059
[PPO] it=  201 steps=  411648 avg10= -24.88 loss=0.220 pg=-0.001 vf=0.495 H=5.320 KL=0.0063 clip_frac=0.059
[PPO] it=  206 steps=  421888 avg10= -14.40 loss=0.507 pg=-0.001 vf=1.068 H=5.293 KL=0.0061 clip_frac=0.055
[PPO] it=  211 steps=  432128 avg10=  -4.71 loss=0.464 pg=-0.002 vf=0.984 H=5.286 KL=0.0044 clip_frac=0.054
[PPO] it=  216 steps=  442368 avg10=   9.47 loss=0.327 pg=-0.001 vf=0.708 H=5.287 KL=0.0060 clip_frac=0.076
[PPO] it=  221 steps=  452608 avg10=  18.60 loss=0.249 pg=-0.002 vf=0.556 H=5.281 KL=0.0072 clip_frac=0.083
[PPO] it=  226 steps=  462848 avg10=  19.81 loss=0.417 pg=-0.002 vf=0.890 H=5.274 KL=0.0073 clip_frac=0.082
[PPO] it=  231 steps=  473088 avg10=  24.13 loss=0.522 pg=0.007 vf=1.083 H=5.274 KL=0.0095 clip_frac=0.144
[PPO] it=  236 steps=  483328 avg10=  17.36 loss=14.659 pg=0.005 vf=29.360 H=5.264 KL=0.0094 clip_frac=0.118
[PPO] it=  241 steps=  493568 avg10=  32.31 loss=0.858 pg=-0.000 vf=1.770 H=5.254 KL=0.0057 clip_frac=0.073
[PPO] it=  246 steps=  503808 avg10=  41.18 loss=1.216 pg=-0.001 vf=2.485 H=5.228 KL=0.0067 clip_frac=0.086
[PPO] it=  251 steps=  514048 avg10=  46.97 loss=0.412 pg=-0.001 vf=0.878 H=5.229 KL=0.0076 clip_frac=0.079
[PPO] it=  256 steps=  524288 avg10=  55.44 loss=0.773 pg=0.001 vf=1.596 H=5.230 KL=0.0061 clip_frac=0.068
[PPO] it=  261 steps=  534528 avg10=  27.30 loss=0.980 pg=-0.001 vf=2.014 H=5.227 KL=0.0052 clip_frac=0.053
[PPO] it=  266 steps=  544768 avg10= -50.04 loss=119.509 pg=0.082 vf=238.907 H=5.227 KL=0.0623 clip_frac=0.543
[PPO] it=  269 steps=  550000 avg10= -60.00 loss=24.441 pg=0.002 vf=48.930 H=5.227 KL=0.0058 clip_frac=0.050
[PPO] done steps=550000 time=855.5s avg10=-60.00
Saved BipedalWalker PPO run 10 model to a3_bonus_ppo_artifacts/bipedal_walker/run_10_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_10_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return -69.53 steps 261
Eval episode 2 seed 1228 return -37.00 steps 519
Eval episode 3 seed 1229 return -19.32 steps 630
Eval episode 4 seed 1230 return -63.63 steps 699
Eval episode 5 seed 1231 return 2.32 steps 761
Eval episode 6 seed 1232 return -65.05 steps 304
Eval episode 7 seed 1233 return -68.21 steps 313
Eval episode 8 seed 1234 return -9.79 steps 658
Eval episode 9 seed 1235 return 29.04 steps 1073
Eval episode 10 seed 1236 return 9.93 steps 949
Greedy evaluation mean -29.12  std 34.74
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_10_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_10_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=9, seed=1235, return=29.04, steps=1073
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_10_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 29.04 steps 1073 with seed 1235 into a3_bonus_ppo_artifacts/bipedal_walker/run_10_bipedal_ppo/videos
Replayed best episode for video: return=29.04, steps=1073

Run#12

In [ ]:
# Fresh PPO model for BipedalWalker (reuse bipedal_cfg from earlier)
bipedal_model_run11 = build_ppo_continuous_model_from_config(bipedal_cfg).to(device)

# PPO hyperparameters for BipedalWalker (more conservative than run 7)
bipedal_ppo_cfg_run11 = PPOUpdateConfig(
    clip_range=0.15,      # tighter than 0.20/0.25 to avoid big policy jumps
    value_coef=0.5,
    entropy_coef=0.003,   # slightly lower than before, less noisy exploration
    max_grad_norm=0.5,
    n_epochs=10,          # a bit more optimization per batch
    batch_size=128,       # larger batch for smoother updates
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run11 = "run_11_bipedal_ppo"
bipedal_run_dir_run11 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run11)
print(f"BipedalWalker PPO run 11 dir: {bipedal_run_dir_run11}")

# Training budget
bipedal_total_steps_run11 = 800_000      # more than run 7 (500k) but not crazy
bipedal_rollout_len_run11 = 4096        # longer rollout for better gradient estimates

# Train PPO on BipedalWalker
bipedal_model_run11, bipedal_episode_returns_run11, bipedal_logs_run11 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run11,
    control_type="continuous",
    run_dir=bipedal_run_dir_run11,
    total_env_steps=bipedal_total_steps_run11,
    rollout_len=bipedal_rollout_len_run11,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run11,
    lr=2.5e-4,           # a bit smaller than 3e-4 for stability
    log_every=10_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run11, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run11, dtype=np.float32),
)

bipedal_model_path_run11 = os.path.join(bipedal_run_dir_run11, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run11.state_dict(), bipedal_model_path_run11)
print(f"Saved BipedalWalker PPO run 11 model to {bipedal_model_path_run11}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run11,
    run_dir=bipedal_run_dir_run11,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 11)",
    ma_window=20,
)

# Greedy evaluation with CSV logging (10 episodes)
csv_path_bipedal_run11 = os.path.join(bipedal_run_dir_run11, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run11 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run11,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run11,
)

# Save eval returns and plot
np.save(
    os.path.join(bipedal_run_dir_run11, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run11, dtype=np.float32),
)

plot_rewards(
    rewards=bipedal_eval_returns_run11,
    run_dir=bipedal_run_dir_run11,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 11)",
    ma_window=3,
)

# Record video of the best greedy evaluation episode
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run11,
    control_type="continuous",
    run_dir=bipedal_run_dir_run11,
    csv_path=csv_path_bipedal_run11,
    max_steps=1600,
)
BipedalWalker PPO run 11 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_11_bipedal_ppo
[PPO] it=    1 steps=    4096 avg10=-115.64 loss=79.425 pg=0.002 vf=158.882 H=5.701 KL=0.0100 clip_frac=0.176
[PPO] it=    4 steps=   16384 avg10=-122.22 loss=31.756 pg=0.004 vf=63.538 H=5.718 KL=0.0133 clip_frac=0.225
[PPO] it=    7 steps=   28672 avg10=-119.28 loss=31.378 pg=0.004 vf=62.782 H=5.743 KL=0.0088 clip_frac=0.139
[PPO] it=   10 steps=   40960 avg10=-109.18 loss=25.747 pg=0.004 vf=51.522 H=5.769 KL=0.0085 clip_frac=0.129
[PPO] it=   13 steps=   53248 avg10=-106.19 loss=19.617 pg=0.006 vf=39.257 H=5.768 KL=0.0117 clip_frac=0.207
[PPO] it=   16 steps=   65536 avg10=-112.72 loss=7.199 pg=0.004 vf=14.424 H=5.815 KL=0.0117 clip_frac=0.235
[PPO] it=   19 steps=   77824 avg10=-114.13 loss=2.634 pg=0.005 vf=5.292 H=5.829 KL=0.0235 clip_frac=0.359
[PPO] it=   22 steps=   90112 avg10=-110.55 loss=12.992 pg=0.006 vf=26.006 H=5.881 KL=0.0166 clip_frac=0.303
[PPO] it=   25 steps=  102400 avg10=-110.17 loss=23.709 pg=0.005 vf=47.443 H=5.874 KL=0.0107 clip_frac=0.202
[PPO] it=   28 steps=  114688 avg10=-108.72 loss=0.403 pg=0.005 vf=0.832 H=5.840 KL=0.0200 clip_frac=0.337
[PPO] it=   31 steps=  126976 avg10=-102.26 loss=0.126 pg=0.003 vf=0.280 H=5.811 KL=0.0102 clip_frac=0.204
[PPO] it=   34 steps=  139264 avg10=-105.55 loss=8.557 pg=0.001 vf=17.147 H=5.792 KL=0.0089 clip_frac=0.131
[PPO] it=   37 steps=  151552 avg10= -99.37 loss=0.174 pg=-0.001 vf=0.383 H=5.802 KL=0.0098 clip_frac=0.166
[PPO] it=   40 steps=  163840 avg10= -90.30 loss=0.258 pg=0.002 vf=0.546 H=5.760 KL=0.0092 clip_frac=0.176
[PPO] it=   43 steps=  176128 avg10= -81.85 loss=0.174 pg=0.001 vf=0.381 H=5.667 KL=0.0094 clip_frac=0.184
[PPO] it=   46 steps=  188416 avg10= -73.26 loss=0.132 pg=-0.001 vf=0.298 H=5.635 KL=0.0080 clip_frac=0.125
[PPO] it=   49 steps=  200704 avg10= -69.83 loss=0.086 pg=0.001 vf=0.206 H=5.629 KL=0.0084 clip_frac=0.123
[PPO] it=   52 steps=  212992 avg10= -71.99 loss=22.465 pg=0.011 vf=44.941 H=5.604 KL=0.0100 clip_frac=0.130
[PPO] it=   55 steps=  225280 avg10= -72.72 loss=0.336 pg=-0.001 vf=0.707 H=5.614 KL=0.0104 clip_frac=0.168
[PPO] it=   58 steps=  237568 avg10= -70.95 loss=12.488 pg=0.007 vf=24.996 H=5.626 KL=0.0124 clip_frac=0.189
[PPO] it=   61 steps=  249856 avg10= -59.39 loss=0.406 pg=0.002 vf=0.841 H=5.612 KL=0.0102 clip_frac=0.176
[PPO] it=   64 steps=  262144 avg10= -56.32 loss=0.687 pg=0.001 vf=1.405 H=5.611 KL=0.0102 clip_frac=0.174
[PPO] it=   67 steps=  274432 avg10= -68.84 loss=5.054 pg=0.001 vf=10.139 H=5.603 KL=0.0075 clip_frac=0.106
[PPO] it=   70 steps=  286720 avg10= -53.05 loss=0.480 pg=0.001 vf=0.993 H=5.566 KL=0.0097 clip_frac=0.151
[PPO] it=   73 steps=  299008 avg10= -32.08 loss=0.328 pg=0.001 vf=0.687 H=5.538 KL=0.0092 clip_frac=0.161
[PPO] it=   76 steps=  311296 avg10= -16.67 loss=0.462 pg=-0.001 vf=0.958 H=5.462 KL=0.0074 clip_frac=0.115
[PPO] it=   79 steps=  323584 avg10= -14.77 loss=0.393 pg=-0.001 vf=0.820 H=5.385 KL=0.0101 clip_frac=0.139
[PPO] it=   82 steps=  335872 avg10=   7.09 loss=0.849 pg=-0.002 vf=1.733 H=5.336 KL=0.0090 clip_frac=0.143
[PPO] it=   85 steps=  348160 avg10=  18.57 loss=0.487 pg=-0.003 vf=1.011 H=5.309 KL=0.0096 clip_frac=0.159
[PPO] it=   88 steps=  360448 avg10=  19.17 loss=0.818 pg=-0.001 vf=1.671 H=5.297 KL=0.0087 clip_frac=0.142
[PPO] it=   91 steps=  372736 avg10=  38.43 loss=0.968 pg=-0.000 vf=1.967 H=5.267 KL=0.0095 clip_frac=0.136
[PPO] it=   94 steps=  385024 avg10=  48.77 loss=0.543 pg=-0.002 vf=1.121 H=5.247 KL=0.0094 clip_frac=0.154
[PPO] it=   97 steps=  397312 avg10=  60.98 loss=1.120 pg=-0.003 vf=2.277 H=5.199 KL=0.0086 clip_frac=0.149
[PPO] it=  100 steps=  409600 avg10=  72.12 loss=0.912 pg=-0.000 vf=1.856 H=5.157 KL=0.0091 clip_frac=0.124
[PPO] it=  103 steps=  421888 avg10=  81.43 loss=0.698 pg=-0.001 vf=1.428 H=5.115 KL=0.0094 clip_frac=0.149
[PPO] it=  106 steps=  434176 avg10=  92.97 loss=0.963 pg=-0.002 vf=1.960 H=5.082 KL=0.0084 clip_frac=0.151
[PPO] it=  109 steps=  446464 avg10=  61.46 loss=10.318 pg=0.002 vf=20.663 H=5.065 KL=0.0141 clip_frac=0.200
[PPO] it=  112 steps=  458752 avg10=  89.07 loss=1.029 pg=0.000 vf=2.087 H=4.988 KL=0.0080 clip_frac=0.155
[PPO] it=  115 steps=  471040 avg10= 118.19 loss=0.696 pg=-0.004 vf=1.431 H=4.955 KL=0.0086 clip_frac=0.149
[PPO] it=  118 steps=  483328 avg10= 113.28 loss=14.726 pg=-0.003 vf=29.487 H=4.977 KL=0.0412 clip_frac=0.259
[PPO] it=  121 steps=  495616 avg10=  89.72 loss=0.736 pg=0.005 vf=1.491 H=4.975 KL=0.0146 clip_frac=0.237
[PPO] it=  124 steps=  507904 avg10=  71.11 loss=9.793 pg=0.000 vf=19.615 H=4.949 KL=0.0131 clip_frac=0.197
[PPO] it=  127 steps=  520192 avg10=  98.84 loss=1.051 pg=-0.002 vf=2.135 H=4.954 KL=0.0100 clip_frac=0.178
[PPO] it=  130 steps=  532480 avg10=  64.93 loss=0.777 pg=0.007 vf=1.571 H=4.937 KL=0.0152 clip_frac=0.265
[PPO] it=  133 steps=  544768 avg10=  77.87 loss=19.474 pg=0.032 vf=38.913 H=4.917 KL=0.0407 clip_frac=0.370
[PPO] it=  136 steps=  557056 avg10= 104.59 loss=0.380 pg=-0.000 vf=0.790 H=4.895 KL=0.0112 clip_frac=0.183
[PPO] it=  139 steps=  569344 avg10= 136.73 loss=1.036 pg=0.001 vf=2.100 H=4.883 KL=0.0104 clip_frac=0.180
[PPO] it=  142 steps=  581632 avg10= 126.21 loss=0.803 pg=0.002 vf=1.630 H=4.862 KL=0.0122 clip_frac=0.225
[PPO] it=  145 steps=  593920 avg10= 146.08 loss=0.686 pg=-0.002 vf=1.405 H=4.853 KL=0.0099 clip_frac=0.164
[PPO] it=  148 steps=  606208 avg10= 135.50 loss=0.855 pg=0.014 vf=1.710 H=4.846 KL=0.0144 clip_frac=0.227
[PPO] it=  151 steps=  618496 avg10= 106.79 loss=25.808 pg=0.001 vf=51.644 H=4.837 KL=0.0710 clip_frac=0.304
[PPO] it=  154 steps=  630784 avg10=  77.32 loss=8.669 pg=0.024 vf=17.321 H=4.879 KL=0.0609 clip_frac=0.294
[PPO] it=  157 steps=  643072 avg10=  95.27 loss=0.764 pg=0.008 vf=1.542 H=4.849 KL=0.0121 clip_frac=0.247
[PPO] it=  160 steps=  655360 avg10= 121.38 loss=1.574 pg=0.015 vf=3.147 H=4.843 KL=0.0157 clip_frac=0.256
[PPO] it=  163 steps=  667648 avg10= 111.85 loss=2.074 pg=0.102 vf=3.972 H=4.869 KL=0.1419 clip_frac=0.457
[PPO] it=  166 steps=  679936 avg10= 108.00 loss=1.063 pg=0.006 vf=2.143 H=4.874 KL=0.0137 clip_frac=0.246
[PPO] it=  169 steps=  692224 avg10=  91.10 loss=1.455 pg=0.024 vf=2.892 H=4.902 KL=0.0260 clip_frac=0.351
[PPO] it=  172 steps=  704512 avg10= 116.66 loss=31.512 pg=0.019 vf=63.015 H=4.900 KL=0.0735 clip_frac=0.393
[PPO] it=  175 steps=  716800 avg10=  76.00 loss=1.602 pg=0.004 vf=3.226 H=4.899 KL=0.0122 clip_frac=0.219
[PPO] it=  178 steps=  729088 avg10=  87.13 loss=1.375 pg=0.002 vf=2.775 H=4.896 KL=0.0149 clip_frac=0.259
[PPO] it=  181 steps=  741376 avg10=  85.05 loss=1.403 pg=0.014 vf=2.808 H=4.885 KL=0.0199 clip_frac=0.296
[PPO] it=  184 steps=  753664 avg10= 108.15 loss=1.863 pg=0.003 vf=3.748 H=4.874 KL=0.0197 clip_frac=0.301
[PPO] it=  187 steps=  765952 avg10= 107.72 loss=1.295 pg=0.006 vf=2.607 H=4.901 KL=0.0198 clip_frac=0.263
[PPO] it=  190 steps=  778240 avg10= 113.30 loss=1.236 pg=0.001 vf=2.500 H=4.928 KL=0.0139 clip_frac=0.223
[PPO] it=  193 steps=  790528 avg10=  93.25 loss=1.341 pg=0.042 vf=2.627 H=4.935 KL=0.0412 clip_frac=0.311
[PPO] it=  196 steps=  800000 avg10=  87.12 loss=1.167 pg=-0.002 vf=2.368 H=4.953 KL=0.0154 clip_frac=0.246
[PPO] done steps=800000 time=1048.9s avg10=87.12
Saved BipedalWalker PPO run 11 model to a3_bonus_ppo_artifacts/bipedal_walker/run_11_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_11_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return 150.75 steps 1600
Eval episode 2 seed 1228 return 156.43 steps 1600
Eval episode 3 seed 1229 return 111.33 steps 1600
Eval episode 4 seed 1230 return 153.33 steps 1600
Eval episode 5 seed 1231 return 111.51 steps 1600
Eval episode 6 seed 1232 return 126.86 steps 1600
Eval episode 7 seed 1233 return 133.61 steps 1600
Eval episode 8 seed 1234 return -81.70 steps 505
Eval episode 9 seed 1235 return -99.99 steps 550
Eval episode 10 seed 1236 return 156.71 steps 1600
Greedy evaluation mean 91.88  std 92.89
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_11_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_11_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=10, seed=1236, return=156.71, steps=1600
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_11_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 156.71 steps 1600 with seed 1236 into a3_bonus_ppo_artifacts/bipedal_walker/run_11_bipedal_ppo/videos
Replayed best episode for video: return=156.71, steps=1600

Run#12

In [ ]:
# Discover BipedalWalker dimensions (reuse pattern)
tmp_env = make_env(BIPEDAL_ENV_ID, worker_id=0, base_seed=SEED)
obs_dim_bipedal = tmp_env.observation_space.shape[0]
act_dim_bipedal = tmp_env.action_space.shape[0]
tmp_env.close()

print(f"BipedalWalker obs_dim={obs_dim_bipedal}, act_dim={act_dim_bipedal}")

# PPO model config for continuous control (same as good runs)
bipedal_cfg = PPOContinuousModelConfig(
    obs_dim=obs_dim_bipedal,
    act_dim=act_dim_bipedal,
    hidden_sizes=(256, 256),
)

bipedal_model_run12 = build_ppo_continuous_model_from_config(bipedal_cfg).to(device)

# More conservative PPO hyperparameters
bipedal_ppo_cfg_run12 = PPOUpdateConfig(
    clip_range=0.10,      # tighter than 0.2 to avoid huge policy jumps
    value_coef=0.5,
    entropy_coef=0.001,   # small exploration, not 0.01
    max_grad_norm=0.5,
    n_epochs=4,           # keep epochs but rely on smaller lr + clip
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run12 = "run_12_bipedal_ppo"
bipedal_run_dir_run12 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run12)
print(f"BipedalWalker PPO run 12 dir: {bipedal_run_dir_run12}")

# Training budget (larger rollout, more total steps)
bipedal_total_steps_run12 = 800_000
bipedal_rollout_len_run12 = 4096

# Train PPO on BipedalWalker
bipedal_model_run12, bipedal_episode_returns_run12, bipedal_logs_run12 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run12,
    control_type="continuous",
    run_dir=bipedal_run_dir_run12,
    total_env_steps=bipedal_total_steps_run12,
    rollout_len=bipedal_rollout_len_run12,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run12,
    lr=1e-4,              # much smaller step size than before
    log_every=10_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run12, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run12, dtype=np.float32),
)

bipedal_model_path_run12 = os.path.join(bipedal_run_dir_run12, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run12.state_dict(), bipedal_model_path_run12)
print(f"Saved BipedalWalker PPO run 12 model to {bipedal_model_path_run12}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run12,
    run_dir=bipedal_run_dir_run12,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 12)",
    ma_window=20,
)

# Greedy evaluation with CSV logging (10 episodes)
csv_path_bipedal_run12 = os.path.join(bipedal_run_dir_run12, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run12 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run12,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run12,
)

# save eval .npy
np.save(
    os.path.join(bipedal_run_dir_run12, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run12, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run12,
    run_dir=bipedal_run_dir_run12,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 12)",
    ma_window=3,
)

# Record video of the best greedy evaluation episode
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run12,
    control_type="continuous",
    run_dir=bipedal_run_dir_run12,
    csv_path=csv_path_bipedal_run12,
    max_steps=1600,
)
BipedalWalker obs_dim=24, act_dim=4
BipedalWalker PPO run 12 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_12_bipedal_ppo
[PPO] it=    1 steps=    4096 avg10=-112.01 loss=255.940 pg=0.001 vf=511.890 H=5.677 KL=0.0037 clip_frac=0.043
[PPO] it=    4 steps=   16384 avg10=-109.60 loss=55.577 pg=0.001 vf=111.164 H=5.679 KL=0.0047 clip_frac=0.060
[PPO] it=    7 steps=   28672 avg10=-108.40 loss=27.341 pg=0.002 vf=54.689 H=5.683 KL=0.0074 clip_frac=0.129
[PPO] it=   10 steps=   40960 avg10=-107.85 loss=14.552 pg=0.002 vf=29.113 H=5.684 KL=0.0066 clip_frac=0.123
[PPO] it=   13 steps=   53248 avg10=-105.70 loss=28.239 pg=0.003 vf=56.483 H=5.684 KL=0.0066 clip_frac=0.115
[PPO] it=   16 steps=   65536 avg10= -99.55 loss=36.988 pg=0.002 vf=73.983 H=5.680 KL=0.0066 clip_frac=0.118
[PPO] it=   19 steps=   77824 avg10=-101.53 loss=37.520 pg=0.004 vf=75.045 H=5.692 KL=0.0069 clip_frac=0.115
[PPO] it=   22 steps=   90112 avg10= -98.79 loss=33.087 pg=0.002 vf=66.181 H=5.682 KL=0.0064 clip_frac=0.092
[PPO] it=   25 steps=  102400 avg10=-108.74 loss=33.335 pg=0.001 vf=66.679 H=5.679 KL=0.0072 clip_frac=0.136
[PPO] it=   28 steps=  114688 avg10=-104.69 loss=0.461 pg=0.004 vf=0.925 H=5.665 KL=0.0085 clip_frac=0.209
[PPO] it=   31 steps=  126976 avg10= -97.85 loss=0.392 pg=0.003 vf=0.790 H=5.653 KL=0.0085 clip_frac=0.190
[PPO] it=   34 steps=  139264 avg10= -95.26 loss=0.111 pg=0.001 vf=0.231 H=5.638 KL=0.0083 clip_frac=0.190
[PPO] it=   37 steps=  151552 avg10= -88.71 loss=0.170 pg=-0.001 vf=0.353 H=5.622 KL=0.0079 clip_frac=0.170
[PPO] it=   40 steps=  163840 avg10= -92.14 loss=10.753 pg=0.001 vf=21.514 H=5.616 KL=0.0065 clip_frac=0.095
[PPO] it=   43 steps=  176128 avg10= -91.35 loss=0.321 pg=0.000 vf=0.652 H=5.600 KL=0.0072 clip_frac=0.157
[PPO] it=   46 steps=  188416 avg10= -81.91 loss=0.174 pg=0.001 vf=0.358 H=5.584 KL=0.0064 clip_frac=0.116
[PPO] it=   49 steps=  200704 avg10= -79.85 loss=0.133 pg=0.001 vf=0.275 H=5.569 KL=0.0071 clip_frac=0.122
[PPO] it=   52 steps=  212992 avg10= -79.40 loss=7.999 pg=0.006 vf=15.996 H=5.553 KL=0.0073 clip_frac=0.121
[PPO] it=   55 steps=  225280 avg10= -76.79 loss=0.163 pg=-0.000 vf=0.337 H=5.538 KL=0.0062 clip_frac=0.123
[PPO] it=   58 steps=  237568 avg10= -79.49 loss=0.706 pg=0.001 vf=1.420 H=5.538 KL=0.0071 clip_frac=0.128
[PPO] it=   61 steps=  249856 avg10= -73.69 loss=0.121 pg=0.002 vf=0.250 H=5.519 KL=0.0070 clip_frac=0.156
[PPO] it=   64 steps=  262144 avg10= -74.09 loss=0.140 pg=-0.000 vf=0.290 H=5.506 KL=0.0061 clip_frac=0.121
[PPO] it=   67 steps=  274432 avg10= -75.49 loss=10.450 pg=0.002 vf=20.906 H=5.493 KL=0.0060 clip_frac=0.083
[PPO] it=   70 steps=  286720 avg10= -84.21 loss=0.347 pg=0.002 vf=0.703 H=5.484 KL=0.0064 clip_frac=0.134
[PPO] it=   73 steps=  299008 avg10= -73.68 loss=2.703 pg=0.002 vf=5.412 H=5.476 KL=0.0067 clip_frac=0.101
[PPO] it=   76 steps=  311296 avg10= -77.70 loss=0.449 pg=0.001 vf=0.905 H=5.473 KL=0.0065 clip_frac=0.119
[PPO] it=   79 steps=  323584 avg10= -69.55 loss=0.159 pg=0.001 vf=0.327 H=5.453 KL=0.0060 clip_frac=0.106
[PPO] it=   82 steps=  335872 avg10= -66.49 loss=0.226 pg=0.001 vf=0.461 H=5.443 KL=0.0068 clip_frac=0.104
[PPO] it=   85 steps=  348160 avg10= -63.32 loss=0.152 pg=-0.001 vf=0.317 H=5.427 KL=0.0064 clip_frac=0.138
[PPO] it=   88 steps=  360448 avg10= -69.61 loss=0.262 pg=0.001 vf=0.533 H=5.412 KL=0.0072 clip_frac=0.106
[PPO] it=   91 steps=  372736 avg10= -67.84 loss=0.370 pg=0.001 vf=0.749 H=5.394 KL=0.0068 clip_frac=0.122
[PPO] it=   94 steps=  385024 avg10= -62.13 loss=0.262 pg=-0.000 vf=0.536 H=5.385 KL=0.0066 clip_frac=0.104
[PPO] it=   97 steps=  397312 avg10= -64.37 loss=0.647 pg=0.002 vf=1.302 H=5.375 KL=0.0070 clip_frac=0.144
[PPO] it=  100 steps=  409600 avg10= -62.58 loss=0.298 pg=0.001 vf=0.604 H=5.371 KL=0.0062 clip_frac=0.114
[PPO] it=  103 steps=  421888 avg10= -92.65 loss=11.291 pg=0.002 vf=22.587 H=5.364 KL=0.0056 clip_frac=0.087
[PPO] it=  106 steps=  434176 avg10= -97.94 loss=0.303 pg=0.002 vf=0.613 H=5.358 KL=0.0080 clip_frac=0.171
[PPO] it=  109 steps=  446464 avg10= -70.80 loss=14.784 pg=0.007 vf=29.564 H=5.352 KL=0.0079 clip_frac=0.068
[PPO] it=  112 steps=  458752 avg10= -67.26 loss=0.441 pg=0.001 vf=0.890 H=5.329 KL=0.0076 clip_frac=0.147
[PPO] it=  115 steps=  471040 avg10= -95.46 loss=10.337 pg=-0.001 vf=20.686 H=5.321 KL=0.0052 clip_frac=0.074
[PPO] it=  118 steps=  483328 avg10= -90.38 loss=7.929 pg=0.003 vf=15.861 H=5.326 KL=0.0074 clip_frac=0.131
[PPO] it=  121 steps=  495616 avg10= -68.78 loss=0.441 pg=0.000 vf=0.891 H=5.303 KL=0.0065 clip_frac=0.112
[PPO] it=  124 steps=  507904 avg10= -49.09 loss=0.186 pg=0.001 vf=0.381 H=5.288 KL=0.0070 clip_frac=0.139
[PPO] it=  127 steps=  520192 avg10= -73.88 loss=0.676 pg=0.003 vf=1.357 H=5.287 KL=0.0064 clip_frac=0.140
[PPO] it=  130 steps=  532480 avg10= -52.21 loss=0.655 pg=0.001 vf=1.320 H=5.280 KL=0.0075 clip_frac=0.148
[PPO] it=  133 steps=  544768 avg10= -45.88 loss=0.397 pg=0.000 vf=0.804 H=5.260 KL=0.0074 clip_frac=0.154
[PPO] it=  136 steps=  557056 avg10= -51.43 loss=0.453 pg=0.002 vf=0.912 H=5.250 KL=0.0070 clip_frac=0.147
[PPO] it=  139 steps=  569344 avg10= -39.69 loss=0.322 pg=0.000 vf=0.654 H=5.246 KL=0.0077 clip_frac=0.159
[PPO] it=  142 steps=  581632 avg10= -34.69 loss=0.310 pg=0.001 vf=0.627 H=5.234 KL=0.0067 clip_frac=0.132
[PPO] it=  145 steps=  593920 avg10= -26.55 loss=0.246 pg=0.001 vf=0.501 H=5.222 KL=0.0070 clip_frac=0.135
[PPO] it=  148 steps=  606208 avg10= -21.98 loss=7.831 pg=0.000 vf=15.672 H=5.196 KL=0.0072 clip_frac=0.123
[PPO] it=  151 steps=  618496 avg10= -30.99 loss=0.790 pg=0.001 vf=1.587 H=5.199 KL=0.0071 clip_frac=0.178
[PPO] it=  154 steps=  630784 avg10= -39.69 loss=0.521 pg=0.002 vf=1.048 H=5.192 KL=0.0085 clip_frac=0.178
[PPO] it=  157 steps=  643072 avg10= -38.38 loss=23.059 pg=0.003 vf=46.122 H=5.177 KL=0.0061 clip_frac=0.094
[PPO] it=  160 steps=  655360 avg10= -31.20 loss=0.679 pg=0.003 vf=1.364 H=5.182 KL=0.0069 clip_frac=0.142
[PPO] it=  163 steps=  667648 avg10= -26.75 loss=0.358 pg=0.003 vf=0.720 H=5.175 KL=0.0069 clip_frac=0.164
[PPO] it=  166 steps=  679936 avg10= -21.71 loss=12.020 pg=0.001 vf=24.049 H=5.167 KL=0.0070 clip_frac=0.144
[PPO] it=  169 steps=  692224 avg10= -24.00 loss=0.278 pg=0.000 vf=0.566 H=5.153 KL=0.0072 clip_frac=0.146
[PPO] it=  172 steps=  704512 avg10=  -6.40 loss=0.449 pg=0.001 vf=0.906 H=5.130 KL=0.0069 clip_frac=0.131
[PPO] it=  175 steps=  716800 avg10=   0.55 loss=0.276 pg=-0.000 vf=0.563 H=5.109 KL=0.0069 clip_frac=0.128
[PPO] it=  178 steps=  729088 avg10= -10.83 loss=0.678 pg=0.002 vf=1.362 H=5.099 KL=0.0083 clip_frac=0.180
[PPO] it=  181 steps=  741376 avg10=  -0.70 loss=0.330 pg=0.001 vf=0.668 H=5.077 KL=0.0069 clip_frac=0.151
[PPO] it=  184 steps=  753664 avg10= -17.26 loss=5.217 pg=0.003 vf=10.438 H=5.063 KL=0.0074 clip_frac=0.115
[PPO] it=  187 steps=  765952 avg10=  -8.03 loss=0.340 pg=0.001 vf=0.688 H=5.062 KL=0.0081 clip_frac=0.171
[PPO] it=  190 steps=  778240 avg10= -40.67 loss=41.268 pg=0.003 vf=82.541 H=5.051 KL=0.0067 clip_frac=0.090
[PPO] it=  193 steps=  790528 avg10= -28.89 loss=10.952 pg=0.001 vf=21.914 H=5.048 KL=0.0067 clip_frac=0.154
[PPO] it=  196 steps=  800000 avg10=   0.68 loss=1.036 pg=0.005 vf=2.073 H=5.039 KL=0.0087 clip_frac=0.232
[PPO] done steps=800000 time=979.6s avg10=0.68
Saved BipedalWalker PPO run 12 model to a3_bonus_ppo_artifacts/bipedal_walker/run_12_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_12_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return 215.66 steps 1600
Eval episode 2 seed 1228 return 219.11 steps 1600
Eval episode 3 seed 1229 return 197.27 steps 1600
Eval episode 4 seed 1230 return 204.04 steps 1600
Eval episode 5 seed 1231 return -20.26 steps 732
Eval episode 6 seed 1232 return 183.99 steps 1600
Eval episode 7 seed 1233 return 84.40 steps 1535
Eval episode 8 seed 1234 return 192.51 steps 1600
Eval episode 9 seed 1235 return 206.02 steps 1600
Eval episode 10 seed 1236 return 182.01 steps 1600
Greedy evaluation mean 166.47  std 72.07
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_12_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_12_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=2, seed=1228, return=219.11, steps=1600
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_12_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 219.11 steps 1600 with seed 1228 into a3_bonus_ppo_artifacts/bipedal_walker/run_12_bipedal_ppo/videos
Replayed best episode for video: return=219.11, steps=1600

Run#13

In [ ]:
# Discover BipedalWalker dimensions (reuse pattern)
tmp_env = make_env(BIPEDAL_ENV_ID, worker_id=0, base_seed=SEED)
obs_dim_bipedal = tmp_env.observation_space.shape[0]
act_dim_bipedal = tmp_env.action_space.shape[0]
tmp_env.close()

print(f"BipedalWalker obs_dim={obs_dim_bipedal}, act_dim={act_dim_bipedal}")

# PPO model config for continuous control
bipedal_cfg_run13 = PPOContinuousModelConfig(
    obs_dim=obs_dim_bipedal,
    act_dim=act_dim_bipedal,
    hidden_sizes=(256, 256),
)

bipedal_model_run13 = build_ppo_continuous_model_from_config(bipedal_cfg_run13).to(device)

# PPO hyperparameters (same as run 12 but higher lr later)
bipedal_ppo_cfg_run13 = PPOUpdateConfig(
    clip_range=0.10,      # tight clip to avoid huge policy jumps
    value_coef=0.5,
    entropy_coef=0.001,
    max_grad_norm=0.5,
    n_epochs=4,
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run13 = "run_13_bipedal_ppo"
bipedal_run_dir_run13 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run13)
print(f"BipedalWalker PPO run 13 dir: {bipedal_run_dir_run13}")

# Training budget (a bit more total steps than run 12)
bipedal_total_steps_run13 = 900_000
bipedal_rollout_len_run13 = 4096

# Train PPO on BipedalWalker (higher learning rate)
bipedal_model_run13, bipedal_episode_returns_run13, bipedal_logs_run13 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run13,
    control_type="continuous",
    run_dir=bipedal_run_dir_run13,
    total_env_steps=bipedal_total_steps_run13,
    rollout_len=bipedal_rollout_len_run13,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run13,
    lr=2e-4,              # higher than run 12 (1e-4)
    log_every=10_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run13, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run13, dtype=np.float32),
)

bipedal_model_path_run13 = os.path.join(bipedal_run_dir_run13, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run13.state_dict(), bipedal_model_path_run13)
print(f"Saved BipedalWalker PPO run 13 model to {bipedal_model_path_run13}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run13,
    run_dir=bipedal_run_dir_run13,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 13)",
    ma_window=20,
)

# Greedy evaluation with CSV logging (10 episodes)
csv_path_bipedal_run13 = os.path.join(bipedal_run_dir_run13, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run13 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run13,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run13,
)

# save eval .npy
np.save(
    os.path.join(bipedal_run_dir_run13, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run13, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run13,
    run_dir=bipedal_run_dir_run13,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 13)",
    ma_window=3,
)

# Record video of the best greedy evaluation episode
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run13,
    control_type="continuous",
    run_dir=bipedal_run_dir_run13,
    csv_path=csv_path_bipedal_run13,
    max_steps=1600,
)
BipedalWalker obs_dim=24, act_dim=4
BipedalWalker PPO run 13 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_13_bipedal_ppo
[PPO] it=    1 steps=    4096 avg10=-116.22 loss=83.788 pg=0.002 vf=167.583 H=5.673 KL=0.0083 clip_frac=0.180
[PPO] it=    4 steps=   16384 avg10=-106.85 loss=63.199 pg=0.002 vf=126.406 H=5.658 KL=0.0076 clip_frac=0.111
[PPO] it=    7 steps=   28672 avg10=-105.99 loss=13.746 pg=0.013 vf=27.477 H=5.654 KL=0.0115 clip_frac=0.280
[PPO] it=   10 steps=   40960 avg10=-104.50 loss=0.789 pg=0.006 vf=1.576 H=5.659 KL=0.0128 clip_frac=0.324
[PPO] it=   13 steps=   53248 avg10=-110.25 loss=0.289 pg=0.003 vf=0.583 H=5.646 KL=0.0096 clip_frac=0.230
[PPO] it=   16 steps=   65536 avg10=-107.82 loss=18.903 pg=0.007 vf=37.804 H=5.621 KL=0.0094 clip_frac=0.167
[PPO] it=   19 steps=   77824 avg10=-105.83 loss=0.036 pg=0.002 vf=0.079 H=5.558 KL=0.0088 clip_frac=0.204
[PPO] it=   22 steps=   90112 avg10= -98.82 loss=0.092 pg=0.003 vf=0.189 H=5.539 KL=0.0086 clip_frac=0.191
[PPO] it=   25 steps=  102400 avg10= -96.77 loss=0.072 pg=0.000 vf=0.155 H=5.488 KL=0.0083 clip_frac=0.194
[PPO] it=   28 steps=  114688 avg10= -92.17 loss=0.064 pg=-0.001 vf=0.141 H=5.436 KL=0.0071 clip_frac=0.172
[PPO] it=   31 steps=  126976 avg10= -91.07 loss=0.167 pg=0.001 vf=0.345 H=5.420 KL=0.0069 clip_frac=0.144
[PPO] it=   34 steps=  139264 avg10= -89.67 loss=0.139 pg=0.001 vf=0.285 H=5.376 KL=0.0062 clip_frac=0.128
[PPO] it=   37 steps=  151552 avg10= -87.98 loss=17.050 pg=0.004 vf=34.103 H=5.362 KL=0.0068 clip_frac=0.099
[PPO] it=   40 steps=  163840 avg10= -87.32 loss=0.210 pg=0.001 vf=0.428 H=5.320 KL=0.0069 clip_frac=0.168
[PPO] it=   43 steps=  176128 avg10= -83.05 loss=0.066 pg=-0.001 vf=0.144 H=5.289 KL=0.0067 clip_frac=0.130
[PPO] it=   46 steps=  188416 avg10= -80.21 loss=0.090 pg=-0.001 vf=0.193 H=5.284 KL=0.0068 clip_frac=0.123
[PPO] it=   49 steps=  200704 avg10= -77.40 loss=0.143 pg=0.000 vf=0.295 H=5.262 KL=0.0069 clip_frac=0.132
[PPO] it=   52 steps=  212992 avg10= -71.22 loss=0.101 pg=0.000 vf=0.212 H=5.227 KL=0.0077 clip_frac=0.137
[PPO] it=   55 steps=  225280 avg10= -72.59 loss=0.075 pg=-0.000 vf=0.160 H=5.188 KL=0.0064 clip_frac=0.109
[PPO] it=   58 steps=  237568 avg10= -68.19 loss=0.106 pg=-0.000 vf=0.223 H=5.149 KL=0.0065 clip_frac=0.101
[PPO] it=   61 steps=  249856 avg10= -67.72 loss=0.096 pg=0.000 vf=0.201 H=5.127 KL=0.0058 clip_frac=0.114
[PPO] it=   64 steps=  262144 avg10= -68.97 loss=5.111 pg=0.008 vf=10.217 H=5.130 KL=0.0404 clip_frac=0.153
[PPO] it=   67 steps=  274432 avg10= -70.67 loss=0.085 pg=-0.000 vf=0.182 H=5.116 KL=0.0067 clip_frac=0.113
[PPO] it=   70 steps=  286720 avg10= -60.01 loss=0.261 pg=-0.002 vf=0.536 H=5.086 KL=0.0071 clip_frac=0.130
[PPO] it=   73 steps=  299008 avg10= -67.06 loss=4.912 pg=0.014 vf=9.807 H=5.077 KL=0.1343 clip_frac=0.167
[PPO] it=   76 steps=  311296 avg10= -57.26 loss=0.317 pg=0.001 vf=0.642 H=5.063 KL=0.0081 clip_frac=0.154
[PPO] it=   79 steps=  323584 avg10=-123.70 loss=105.196 pg=0.119 vf=210.164 H=5.048 KL=14.3194 clip_frac=0.861
[PPO] it=   82 steps=  335872 avg10=-115.49 loss=6.990 pg=0.012 vf=13.966 H=5.051 KL=0.0191 clip_frac=0.432
[PPO] it=   85 steps=  348160 avg10=-116.01 loss=8.043 pg=0.009 vf=16.079 H=5.055 KL=0.0168 clip_frac=0.422
[PPO] it=   88 steps=  360448 avg10=-116.42 loss=9.328 pg=0.012 vf=18.641 H=5.055 KL=0.0233 clip_frac=0.444
[PPO] it=   91 steps=  372736 avg10=-117.87 loss=5.199 pg=0.014 vf=10.380 H=5.061 KL=0.0242 clip_frac=0.493
[PPO] it=   94 steps=  385024 avg10=-114.75 loss=4.897 pg=0.013 vf=9.779 H=5.072 KL=0.0239 clip_frac=0.504
[PPO] it=   97 steps=  397312 avg10=-113.60 loss=3.134 pg=0.013 vf=6.252 H=5.077 KL=0.0213 clip_frac=0.497
[PPO] it=  100 steps=  409600 avg10=-113.48 loss=5.449 pg=0.013 vf=10.883 H=5.092 KL=0.0202 clip_frac=0.479
[PPO] it=  103 steps=  421888 avg10=-113.70 loss=4.637 pg=0.008 vf=9.269 H=5.090 KL=0.0166 clip_frac=0.430
[PPO] it=  106 steps=  434176 avg10=-114.49 loss=6.565 pg=0.009 vf=13.123 H=5.107 KL=0.0178 clip_frac=0.428
[PPO] it=  109 steps=  446464 avg10=-112.95 loss=4.256 pg=0.012 vf=8.500 H=5.118 KL=0.0207 clip_frac=0.486
[PPO] it=  112 steps=  458752 avg10=-114.20 loss=7.484 pg=0.009 vf=14.961 H=5.108 KL=0.0181 clip_frac=0.397
[PPO] it=  115 steps=  471040 avg10=-114.71 loss=3.604 pg=0.012 vf=7.194 H=5.113 KL=0.0190 clip_frac=0.452
[PPO] it=  118 steps=  483328 avg10=-114.92 loss=2.411 pg=0.017 vf=4.798 H=5.145 KL=0.0306 clip_frac=0.568
[PPO] it=  121 steps=  495616 avg10=-115.77 loss=4.991 pg=0.007 vf=9.978 H=5.143 KL=0.0159 clip_frac=0.424
[PPO] it=  124 steps=  507904 avg10=-114.62 loss=2.983 pg=0.016 vf=5.945 H=5.142 KL=0.0260 clip_frac=0.538
[PPO] it=  127 steps=  520192 avg10=-114.08 loss=2.951 pg=0.010 vf=5.891 H=5.162 KL=0.0177 clip_frac=0.442
[PPO] it=  130 steps=  532480 avg10=-115.58 loss=1.685 pg=0.010 vf=3.362 H=5.201 KL=0.0192 clip_frac=0.449
[PPO] it=  133 steps=  544768 avg10=-115.06 loss=2.481 pg=0.012 vf=4.949 H=5.217 KL=0.0214 clip_frac=0.496
[PPO] it=  136 steps=  557056 avg10=-113.20 loss=2.341 pg=0.013 vf=4.667 H=5.225 KL=0.0242 clip_frac=0.507
[PPO] it=  139 steps=  569344 avg10=-112.59 loss=3.443 pg=0.010 vf=6.878 H=5.215 KL=0.0156 clip_frac=0.416
[PPO] it=  142 steps=  581632 avg10=-112.69 loss=4.013 pg=0.009 vf=8.018 H=5.242 KL=0.0176 clip_frac=0.418
[PPO] it=  145 steps=  593920 avg10=-114.57 loss=2.198 pg=0.011 vf=4.383 H=5.223 KL=0.0161 clip_frac=0.408
[PPO] it=  148 steps=  606208 avg10=-117.41 loss=2.769 pg=0.012 vf=5.524 H=5.216 KL=0.0218 clip_frac=0.483
[PPO] it=  151 steps=  618496 avg10=-114.72 loss=4.219 pg=0.005 vf=8.438 H=5.194 KL=0.0128 clip_frac=0.344
[PPO] it=  154 steps=  630784 avg10=-113.57 loss=4.428 pg=0.009 vf=8.848 H=5.186 KL=0.0164 clip_frac=0.394
[PPO] it=  157 steps=  643072 avg10=-113.77 loss=3.895 pg=0.010 vf=7.780 H=5.216 KL=0.0196 clip_frac=0.431
[PPO] it=  160 steps=  655360 avg10=-114.84 loss=3.900 pg=0.011 vf=7.789 H=5.230 KL=0.0204 clip_frac=0.467
[PPO] it=  163 steps=  667648 avg10=-112.56 loss=2.598 pg=0.008 vf=5.190 H=5.239 KL=0.0199 clip_frac=0.457
[PPO] it=  166 steps=  679936 avg10=-118.11 loss=5.355 pg=0.006 vf=10.709 H=5.246 KL=0.0150 clip_frac=0.365
[PPO] it=  169 steps=  692224 avg10=-113.93 loss=3.504 pg=0.010 vf=6.999 H=5.220 KL=0.0200 clip_frac=0.453
[PPO] it=  172 steps=  704512 avg10=-115.83 loss=3.271 pg=0.006 vf=6.540 H=5.217 KL=0.0152 clip_frac=0.354
[PPO] it=  175 steps=  716800 avg10=-118.01 loss=2.815 pg=0.016 vf=5.609 H=5.219 KL=0.0278 clip_frac=0.520
[PPO] it=  178 steps=  729088 avg10=-115.83 loss=6.586 pg=0.012 vf=13.159 H=5.209 KL=0.0208 clip_frac=0.450
[PPO] it=  181 steps=  741376 avg10=-118.56 loss=3.396 pg=0.014 vf=6.774 H=5.202 KL=0.0210 clip_frac=0.479
[PPO] it=  184 steps=  753664 avg10=-116.89 loss=5.169 pg=0.007 vf=10.334 H=5.194 KL=0.0166 clip_frac=0.403
[PPO] it=  187 steps=  765952 avg10=-115.93 loss=2.466 pg=0.010 vf=4.922 H=5.191 KL=0.0262 clip_frac=0.508
[PPO] it=  190 steps=  778240 avg10=-117.74 loss=3.333 pg=0.009 vf=6.659 H=5.208 KL=0.0155 clip_frac=0.403
[PPO] it=  193 steps=  790528 avg10=-118.89 loss=5.693 pg=0.006 vf=11.385 H=5.202 KL=0.0150 clip_frac=0.387
[PPO] it=  196 steps=  802816 avg10=-115.85 loss=3.750 pg=0.008 vf=7.493 H=5.221 KL=0.0221 clip_frac=0.447
[PPO] it=  199 steps=  815104 avg10=-119.68 loss=7.448 pg=0.008 vf=14.891 H=5.225 KL=0.0154 clip_frac=0.347
[PPO] it=  202 steps=  827392 avg10=-115.35 loss=3.100 pg=0.007 vf=6.198 H=5.257 KL=0.0167 clip_frac=0.398
[PPO] it=  205 steps=  839680 avg10=-114.38 loss=3.170 pg=0.007 vf=6.336 H=5.267 KL=0.0200 clip_frac=0.436
[PPO] it=  208 steps=  851968 avg10=-118.86 loss=3.726 pg=0.008 vf=7.447 H=5.272 KL=0.0159 clip_frac=0.401
[PPO] it=  211 steps=  864256 avg10=-116.44 loss=4.945 pg=0.008 vf=9.883 H=5.292 KL=0.0170 clip_frac=0.425
[PPO] it=  214 steps=  876544 avg10=-119.29 loss=4.738 pg=0.009 vf=9.469 H=5.299 KL=0.0157 clip_frac=0.394
[PPO] it=  217 steps=  888832 avg10=-119.97 loss=3.179 pg=0.008 vf=6.351 H=5.288 KL=0.0178 clip_frac=0.431
[PPO] it=  220 steps=  900000 avg10=-118.67 loss=2.611 pg=0.020 vf=5.193 H=5.295 KL=0.0239 clip_frac=0.530
[PPO] done steps=900000 time=1457.5s avg10=-118.67
Saved BipedalWalker PPO run 13 model to a3_bonus_ppo_artifacts/bipedal_walker/run_13_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_13_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return -126.06 steps 106
Eval episode 2 seed 1228 return -125.34 steps 103
Eval episode 3 seed 1229 return -125.60 steps 101
Eval episode 4 seed 1230 return -125.32 steps 102
Eval episode 5 seed 1231 return -125.53 steps 101
Eval episode 6 seed 1232 return -125.89 steps 104
Eval episode 7 seed 1233 return -125.32 steps 101
Eval episode 8 seed 1234 return -125.12 steps 102
Eval episode 9 seed 1235 return -126.05 steps 103
Eval episode 10 seed 1236 return -125.63 steps 103
Greedy evaluation mean -125.59  std 0.31
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_13_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_13_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=8, seed=1234, return=-125.12, steps=102
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_13_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return -125.12 steps 102 with seed 1234 into a3_bonus_ppo_artifacts/bipedal_walker/run_13_bipedal_ppo/videos
Replayed best episode for video: return=-125.12, steps=102

Run#14

In [ ]:
# Discover BipedalWalker dimensions
tmp_env = make_env(BIPEDAL_ENV_ID, worker_id=0, base_seed=SEED)
obs_dim_bipedal = tmp_env.observation_space.shape[0]
act_dim_bipedal = tmp_env.action_space.shape[0]
tmp_env.close()

print(f"BipedalWalker obs_dim={obs_dim_bipedal}, act_dim={act_dim_bipedal}")

# PPO model config
bipedal_cfg_run14 = PPOContinuousModelConfig(
    obs_dim=obs_dim_bipedal,
    act_dim=act_dim_bipedal,
    hidden_sizes=(256, 256),
)

bipedal_model_run14 = build_ppo_continuous_model_from_config(bipedal_cfg_run14).to(device)

# PPO hyperparameters (slightly looser clip, still conservative)
bipedal_ppo_cfg_run14 = PPOUpdateConfig(
    clip_range=0.12,
    value_coef=0.5,
    entropy_coef=0.001,
    max_grad_norm=0.5,
    n_epochs=4,
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run14 = "run_14_bipedal_ppo"
bipedal_run_dir_run14 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run14)
print(f"BipedalWalker PPO run 14 dir: {bipedal_run_dir_run14}")

#  Training budget (very long)
bipedal_total_steps_run14 = 1_000_000
bipedal_rollout_len_run14 = 4096

# Train PPO on BipedalWalker
bipedal_model_run14, bipedal_episode_returns_run14, bipedal_logs_run14 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run14,
    control_type="continuous",
    run_dir=bipedal_run_dir_run14,
    total_env_steps=bipedal_total_steps_run14,
    rollout_len=bipedal_rollout_len_run14,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run14,
    lr=1.5e-4,           # between run 12 and run 13
    log_every=10_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run14, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run14, dtype=np.float32),
)

bipedal_model_path_run14 = os.path.join(bipedal_run_dir_run14, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run14.state_dict(), bipedal_model_path_run14)
print(f"Saved BipedalWalker PPO run 14 model to {bipedal_model_path_run14}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run14,
    run_dir=bipedal_run_dir_run14,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 14)",
    ma_window=20,
)

# Greedy evaluation with CSV logging
csv_path_bipedal_run14 = os.path.join(bipedal_run_dir_run14, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run14 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run14,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run14,
)

np.save(
    os.path.join(bipedal_run_dir_run14, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run14, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run14,
    run_dir=bipedal_run_dir_run14,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 14)",
    ma_window=3,
)

# Record best greedy video
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run14,
    control_type="continuous",
    run_dir=bipedal_run_dir_run14,
    csv_path=csv_path_bipedal_run14,
    max_steps=1600,
)
BipedalWalker obs_dim=24, act_dim=4
BipedalWalker PPO run 14 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_14_bipedal_ppo
[PPO] it=    1 steps=    4096 avg10=-107.93 loss=80.777 pg=0.001 vf=161.564 H=5.673 KL=0.0074 clip_frac=0.127
[PPO] it=    4 steps=   16384 avg10=-113.02 loss=60.829 pg=0.005 vf=121.658 H=5.660 KL=0.0096 clip_frac=0.178
[PPO] it=    7 steps=   28672 avg10=-108.71 loss=84.002 pg=0.004 vf=168.008 H=5.667 KL=0.0089 clip_frac=0.162
[PPO] it=   10 steps=   40960 avg10=-107.70 loss=34.250 pg=0.007 vf=68.498 H=5.672 KL=0.0124 clip_frac=0.270
[PPO] it=   13 steps=   53248 avg10=-107.42 loss=37.713 pg=0.006 vf=75.426 H=5.670 KL=0.0120 clip_frac=0.258
[PPO] it=   16 steps=   65536 avg10=-116.35 loss=45.051 pg=0.006 vf=90.100 H=5.694 KL=0.0145 clip_frac=0.303
[PPO] it=   19 steps=   77824 avg10=-112.61 loss=8.104 pg=0.004 vf=16.210 H=5.680 KL=0.0129 clip_frac=0.260
[PPO] it=   22 steps=   90112 avg10=-109.29 loss=34.259 pg=0.006 vf=68.517 H=5.672 KL=0.0150 clip_frac=0.315
[PPO] it=   25 steps=  102400 avg10=-116.63 loss=28.730 pg=0.005 vf=57.461 H=5.683 KL=0.0112 clip_frac=0.259
[PPO] it=   28 steps=  114688 avg10=-109.80 loss=14.190 pg=0.009 vf=28.374 H=5.675 KL=0.0198 clip_frac=0.404
[PPO] it=   31 steps=  126976 avg10=-120.19 loss=0.429 pg=0.009 vf=0.853 H=5.691 KL=0.0198 clip_frac=0.391
[PPO] it=   34 steps=  139264 avg10=-117.15 loss=16.715 pg=0.004 vf=33.432 H=5.678 KL=0.0132 clip_frac=0.258
[PPO] it=   37 steps=  151552 avg10=-113.46 loss=1.091 pg=0.012 vf=2.169 H=5.710 KL=0.0233 clip_frac=0.449
[PPO] it=   40 steps=  163840 avg10=-122.98 loss=10.371 pg=0.007 vf=20.739 H=5.706 KL=0.0134 clip_frac=0.250
[PPO] it=   43 steps=  176128 avg10=-122.54 loss=0.523 pg=0.007 vf=1.043 H=5.702 KL=0.0185 clip_frac=0.369
[PPO] it=   46 steps=  188416 avg10=-117.64 loss=13.270 pg=0.003 vf=26.546 H=5.708 KL=0.0088 clip_frac=0.150
[PPO] it=   49 steps=  200704 avg10=-114.92 loss=10.488 pg=0.003 vf=20.982 H=5.690 KL=0.0097 clip_frac=0.193
[PPO] it=   52 steps=  212992 avg10=-111.99 loss=28.207 pg=0.003 vf=56.419 H=5.683 KL=0.0105 clip_frac=0.162
[PPO] it=   55 steps=  225280 avg10=-114.95 loss=0.416 pg=0.005 vf=0.831 H=5.677 KL=0.0149 clip_frac=0.322
[PPO] it=   58 steps=  237568 avg10=-115.95 loss=9.989 pg=0.001 vf=19.986 H=5.649 KL=0.0109 clip_frac=0.218
[PPO] it=   61 steps=  249856 avg10=-112.46 loss=0.296 pg=0.004 vf=0.596 H=5.658 KL=0.0121 clip_frac=0.249
[PPO] it=   64 steps=  262144 avg10=-110.55 loss=0.297 pg=0.003 vf=0.600 H=5.671 KL=0.0129 clip_frac=0.257
[PPO] it=   67 steps=  274432 avg10=-107.00 loss=9.444 pg=0.003 vf=18.893 H=5.662 KL=0.0107 clip_frac=0.195
[PPO] it=   70 steps=  286720 avg10=-103.86 loss=0.197 pg=0.003 vf=0.400 H=5.692 KL=0.0157 clip_frac=0.324
[PPO] it=   73 steps=  299008 avg10=-106.62 loss=0.109 pg=0.005 vf=0.219 H=5.682 KL=0.0106 clip_frac=0.220
[PPO] it=   76 steps=  311296 avg10=-100.35 loss=0.240 pg=0.006 vf=0.480 H=5.695 KL=0.0127 clip_frac=0.239
[PPO] it=   79 steps=  323584 avg10=-100.90 loss=0.317 pg=0.003 vf=0.641 H=5.680 KL=0.0116 clip_frac=0.223
[PPO] it=   82 steps=  335872 avg10=-101.38 loss=19.931 pg=0.002 vf=39.869 H=5.684 KL=0.0067 clip_frac=0.093
[PPO] it=   85 steps=  348160 avg10=-102.42 loss=14.520 pg=0.002 vf=29.048 H=5.670 KL=0.0090 clip_frac=0.153
[PPO] it=   88 steps=  360448 avg10=-100.01 loss=0.452 pg=0.003 vf=0.910 H=5.671 KL=0.0116 clip_frac=0.246
[PPO] it=   91 steps=  372736 avg10= -97.42 loss=19.375 pg=0.011 vf=38.739 H=5.672 KL=0.0142 clip_frac=0.188
[PPO] it=   94 steps=  385024 avg10= -90.99 loss=0.200 pg=0.005 vf=0.402 H=5.676 KL=0.0125 clip_frac=0.252
[PPO] it=   97 steps=  397312 avg10= -96.64 loss=11.166 pg=0.005 vf=22.334 H=5.663 KL=0.0096 clip_frac=0.176
[PPO] it=  100 steps=  409600 avg10= -89.93 loss=0.332 pg=0.004 vf=0.668 H=5.647 KL=0.0101 clip_frac=0.210
[PPO] it=  103 steps=  421888 avg10= -92.42 loss=32.567 pg=0.004 vf=65.139 H=5.635 KL=0.0089 clip_frac=0.136
[PPO] it=  106 steps=  434176 avg10= -91.71 loss=6.026 pg=0.001 vf=12.060 H=5.646 KL=0.0095 clip_frac=0.148
[PPO] it=  109 steps=  446464 avg10= -87.79 loss=0.374 pg=0.002 vf=0.755 H=5.664 KL=0.0113 clip_frac=0.211
[PPO] it=  112 steps=  458752 avg10= -95.85 loss=0.891 pg=0.002 vf=1.790 H=5.669 KL=0.0111 clip_frac=0.237
[PPO] it=  115 steps=  471040 avg10= -87.09 loss=0.126 pg=0.005 vf=0.254 H=5.659 KL=0.0108 clip_frac=0.239
[PPO] it=  118 steps=  483328 avg10= -87.71 loss=12.559 pg=-0.002 vf=25.132 H=5.653 KL=0.0091 clip_frac=0.138
[PPO] it=  121 steps=  495616 avg10= -93.10 loss=2.560 pg=0.001 vf=5.129 H=5.649 KL=0.0090 clip_frac=0.133
[PPO] it=  124 steps=  507904 avg10= -90.85 loss=0.271 pg=0.004 vf=0.546 H=5.654 KL=0.0098 clip_frac=0.195
[PPO] it=  127 steps=  520192 avg10= -83.56 loss=0.230 pg=0.007 vf=0.458 H=5.660 KL=0.0143 clip_frac=0.316
[PPO] it=  130 steps=  532480 avg10= -88.52 loss=12.852 pg=0.001 vf=25.713 H=5.657 KL=0.0066 clip_frac=0.076
[PPO] it=  133 steps=  544768 avg10= -92.10 loss=0.094 pg=0.005 vf=0.189 H=5.643 KL=0.0113 clip_frac=0.224
[PPO] it=  136 steps=  557056 avg10= -89.76 loss=16.941 pg=0.003 vf=33.887 H=5.642 KL=0.0091 clip_frac=0.153
[PPO] it=  139 steps=  569344 avg10= -84.46 loss=16.707 pg=0.001 vf=33.422 H=5.641 KL=0.0092 clip_frac=0.157
[PPO] it=  142 steps=  581632 avg10= -83.38 loss=0.248 pg=0.002 vf=0.502 H=5.643 KL=0.0103 clip_frac=0.195
[PPO] it=  145 steps=  593920 avg10= -93.88 loss=10.001 pg=0.004 vf=20.006 H=5.639 KL=0.0089 clip_frac=0.127
[PPO] it=  148 steps=  606208 avg10= -91.27 loss=5.759 pg=0.002 vf=11.524 H=5.633 KL=0.0079 clip_frac=0.144
[PPO] it=  151 steps=  618496 avg10= -97.82 loss=23.226 pg=0.002 vf=46.460 H=5.634 KL=0.0061 clip_frac=0.074
[PPO] it=  154 steps=  630784 avg10= -85.53 loss=7.388 pg=0.005 vf=14.777 H=5.636 KL=0.0117 clip_frac=0.211
[PPO] it=  157 steps=  643072 avg10= -83.14 loss=4.257 pg=0.002 vf=8.521 H=5.631 KL=0.0120 clip_frac=0.213
[PPO] it=  160 steps=  655360 avg10= -98.48 loss=41.855 pg=0.002 vf=83.718 H=5.632 KL=0.0081 clip_frac=0.094
[PPO] it=  163 steps=  667648 avg10=-107.25 loss=19.051 pg=0.007 vf=38.101 H=5.637 KL=0.0115 clip_frac=0.213
[PPO] it=  166 steps=  679936 avg10=-105.68 loss=15.139 pg=0.010 vf=30.270 H=5.631 KL=0.0172 clip_frac=0.350
[PPO] it=  169 steps=  692224 avg10=-106.93 loss=18.757 pg=0.005 vf=37.516 H=5.630 KL=0.0129 clip_frac=0.246
[PPO] it=  172 steps=  704512 avg10=-112.97 loss=12.979 pg=0.005 vf=25.959 H=5.618 KL=0.0153 clip_frac=0.297
[PPO] it=  175 steps=  716800 avg10=-117.51 loss=18.766 pg=0.007 vf=37.528 H=5.643 KL=0.0123 clip_frac=0.260
[PPO] it=  178 steps=  729088 avg10=-118.83 loss=40.569 pg=0.003 vf=81.144 H=5.652 KL=0.0107 clip_frac=0.171
[PPO] it=  181 steps=  741376 avg10=-114.61 loss=23.654 pg=0.003 vf=47.314 H=5.650 KL=0.0123 clip_frac=0.228
[PPO] it=  184 steps=  753664 avg10=-112.89 loss=17.749 pg=0.003 vf=35.504 H=5.646 KL=0.0124 clip_frac=0.234
[PPO] it=  187 steps=  765952 avg10=-100.40 loss=14.269 pg=0.005 vf=28.538 H=5.638 KL=0.0120 clip_frac=0.247
[PPO] it=  190 steps=  778240 avg10= -96.32 loss=5.147 pg=0.009 vf=10.288 H=5.622 KL=0.0144 clip_frac=0.291
[PPO] it=  193 steps=  790528 avg10=-103.08 loss=16.809 pg=0.003 vf=33.625 H=5.619 KL=0.0127 clip_frac=0.235
[PPO] it=  196 steps=  802816 avg10= -92.07 loss=17.369 pg=0.007 vf=34.735 H=5.615 KL=0.0136 clip_frac=0.266
[PPO] it=  199 steps=  815104 avg10=-103.89 loss=14.303 pg=0.004 vf=28.610 H=5.604 KL=0.0124 clip_frac=0.242
[PPO] it=  202 steps=  827392 avg10= -88.25 loss=6.154 pg=0.008 vf=12.303 H=5.585 KL=0.0161 clip_frac=0.328
[PPO] it=  205 steps=  839680 avg10= -92.87 loss=1.004 pg=0.006 vf=2.009 H=5.611 KL=0.0109 clip_frac=0.253
[PPO] it=  208 steps=  851968 avg10= -93.28 loss=1.124 pg=0.008 vf=2.243 H=5.630 KL=0.0153 clip_frac=0.334
[PPO] it=  211 steps=  864256 avg10= -76.14 loss=0.563 pg=0.004 vf=1.130 H=5.638 KL=0.0118 clip_frac=0.247
[PPO] it=  214 steps=  876544 avg10= -95.04 loss=1.269 pg=0.006 vf=2.536 H=5.640 KL=0.0137 clip_frac=0.273
[PPO] it=  217 steps=  888832 avg10=-101.65 loss=4.490 pg=0.004 vf=8.984 H=5.630 KL=0.0107 clip_frac=0.179
[PPO] it=  220 steps=  901120 avg10=-116.86 loss=22.307 pg=0.001 vf=44.624 H=5.624 KL=0.0079 clip_frac=0.115
[PPO] it=  223 steps=  913408 avg10= -98.96 loss=18.230 pg=0.002 vf=36.466 H=5.619 KL=0.0092 clip_frac=0.159
[PPO] it=  226 steps=  925696 avg10=-110.51 loss=31.357 pg=0.003 vf=62.719 H=5.595 KL=0.0097 clip_frac=0.148
[PPO] it=  229 steps=  937984 avg10=-119.13 loss=30.929 pg=0.000 vf=61.869 H=5.599 KL=0.0081 clip_frac=0.111
[PPO] it=  232 steps=  950272 avg10=-108.25 loss=32.731 pg=0.002 vf=65.470 H=5.596 KL=0.0091 clip_frac=0.143
[PPO] it=  235 steps=  962560 avg10=-115.94 loss=25.046 pg=0.003 vf=50.097 H=5.592 KL=0.0111 clip_frac=0.200
[PPO] it=  238 steps=  974848 avg10=-100.83 loss=0.472 pg=0.006 vf=0.942 H=5.595 KL=0.0128 clip_frac=0.255
[PPO] it=  241 steps=  987136 avg10=-103.41 loss=11.641 pg=0.003 vf=23.287 H=5.594 KL=0.0097 clip_frac=0.160
[PPO] it=  244 steps=  999424 avg10= -97.13 loss=0.201 pg=0.005 vf=0.404 H=5.557 KL=0.0131 clip_frac=0.270
[PPO] it=  245 steps= 1000000 avg10= -97.13 loss=0.077 pg=0.001 vf=0.163 H=5.552 KL=0.0096 clip_frac=0.184
[PPO] done steps=1000000 time=1556.1s avg10=-97.13
Saved BipedalWalker PPO run 14 model to a3_bonus_ppo_artifacts/bipedal_walker/run_14_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_14_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return -85.62 steps 1600
Eval episode 2 seed 1228 return -85.88 steps 1600
Eval episode 3 seed 1229 return -85.64 steps 1600
Eval episode 4 seed 1230 return -85.68 steps 1600
Eval episode 5 seed 1231 return -85.64 steps 1600
Eval episode 6 seed 1232 return -82.64 steps 1600
Eval episode 7 seed 1233 return -85.64 steps 1600
Eval episode 8 seed 1234 return -85.83 steps 1600
Eval episode 9 seed 1235 return -85.65 steps 1600
Eval episode 10 seed 1236 return -85.46 steps 1600
Greedy evaluation mean -85.37  std 0.92
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_14_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_14_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=6, seed=1232, return=-82.64, steps=1600
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_14_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return -82.64 steps 1600 with seed 1232 into a3_bonus_ppo_artifacts/bipedal_walker/run_14_bipedal_ppo/videos
Replayed best episode for video: return=-82.64, steps=1600

Run#15

In [ ]:
# Discover BipedalWalker dimensions
tmp_env = make_env(BIPEDAL_ENV_ID, worker_id=0, base_seed=SEED)
obs_dim_bipedal = tmp_env.observation_space.shape[0]
act_dim_bipedal = tmp_env.action_space.shape[0]
tmp_env.close()

print(f"BipedalWalker obs_dim={obs_dim_bipedal}, act_dim={act_dim_bipedal}")

# PPO model config
bipedal_cfg_run15 = PPOContinuousModelConfig(
    obs_dim=obs_dim_bipedal,
    act_dim=act_dim_bipedal,
    hidden_sizes=(256, 256),
)

bipedal_model_run15 = build_ppo_continuous_model_from_config(bipedal_cfg_run15).to(device)

# PPO hyperparameters (slightly wider clip + higher entropy)
bipedal_ppo_cfg_run15 = PPOUpdateConfig(
    clip_range=0.15,
    value_coef=0.5,
    entropy_coef=0.003,   # a bit more exploration than run 12
    max_grad_norm=0.5,
    n_epochs=4,
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run15 = "run_15_bipedal_ppo"
bipedal_run_dir_run15 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run15)
print(f"BipedalWalker PPO run 15 dir: {bipedal_run_dir_run15}")

# Training budget
bipedal_total_steps_run15 = 900_000
bipedal_rollout_len_run15 = 4096

# Train PPO on BipedalWalker
bipedal_model_run15, bipedal_episode_returns_run15, bipedal_logs_run15 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run15,
    control_type="continuous",
    run_dir=bipedal_run_dir_run15,
    total_env_steps=bipedal_total_steps_run15,
    rollout_len=bipedal_rollout_len_run15,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run15,
    lr=2e-4,
    log_every=10_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run15, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run15, dtype=np.float32),
)

bipedal_model_path_run15 = os.path.join(bipedal_run_dir_run15, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run15.state_dict(), bipedal_model_path_run15)
print(f"Saved BipedalWalker PPO run 15 model to {bipedal_model_path_run15}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run15,
    run_dir=bipedal_run_dir_run15,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 15)",
    ma_window=20,
)

# Greedy evaluation + CSV
csv_path_bipedal_run15 = os.path.join(bipedal_run_dir_run15, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run15 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run15,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run15,
)

np.save(
    os.path.join(bipedal_run_dir_run15, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run15, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run15,
    run_dir=bipedal_run_dir_run15,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 15)",
    ma_window=3,
)

# Record best eval video
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run15,
    control_type="continuous",
    run_dir=bipedal_run_dir_run15,
    csv_path=csv_path_bipedal_run15,
    max_steps=1600,
)
BipedalWalker obs_dim=24, act_dim=4
BipedalWalker PPO run 15 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_15_bipedal_ppo
[PPO] it=    1 steps=    4096 avg10=-119.93 loss=173.294 pg=0.004 vf=346.614 H=5.678 KL=0.0106 clip_frac=0.140
[PPO] it=    4 steps=   16384 avg10=-110.51 loss=145.195 pg=0.007 vf=290.411 H=5.675 KL=0.0124 clip_frac=0.161
[PPO] it=    7 steps=   28672 avg10=-109.88 loss=1.427 pg=0.004 vf=2.880 H=5.665 KL=0.0118 clip_frac=0.190
[PPO] it=   10 steps=   40960 avg10=-111.52 loss=28.749 pg=0.002 vf=57.527 H=5.647 KL=0.0083 clip_frac=0.073
[PPO] it=   13 steps=   53248 avg10=-113.34 loss=1.121 pg=0.007 vf=2.262 H=5.636 KL=0.0145 clip_frac=0.217
[PPO] it=   16 steps=   65536 avg10=-108.98 loss=32.771 pg=-0.000 vf=65.576 H=5.616 KL=0.0080 clip_frac=0.046
[PPO] it=   19 steps=   77824 avg10=-109.70 loss=5.847 pg=0.004 vf=11.721 H=5.608 KL=0.0107 clip_frac=0.109
[PPO] it=   22 steps=   90112 avg10=-108.35 loss=12.381 pg=0.003 vf=24.790 H=5.613 KL=0.0107 clip_frac=0.110
[PPO] it=   25 steps=  102400 avg10=-111.57 loss=16.108 pg=0.001 vf=32.249 H=5.624 KL=0.0126 clip_frac=0.114
[PPO] it=   28 steps=  114688 avg10=-112.00 loss=8.492 pg=0.008 vf=17.002 H=5.652 KL=0.0142 clip_frac=0.186
[PPO] it=   31 steps=  126976 avg10=-112.52 loss=14.885 pg=0.004 vf=29.796 H=5.650 KL=0.0114 clip_frac=0.111
[PPO] it=   34 steps=  139264 avg10=-113.15 loss=0.167 pg=0.002 vf=0.363 H=5.626 KL=0.0126 clip_frac=0.176
[PPO] it=   37 steps=  151552 avg10=-112.59 loss=0.212 pg=0.000 vf=0.458 H=5.632 KL=0.0134 clip_frac=0.146
[PPO] it=   40 steps=  163840 avg10=-110.66 loss=27.154 pg=0.000 vf=54.341 H=5.619 KL=0.0085 clip_frac=0.059
[PPO] it=   43 steps=  176128 avg10=-112.13 loss=27.639 pg=-0.000 vf=55.312 H=5.622 KL=0.0079 clip_frac=0.052
[PPO] it=   46 steps=  188416 avg10=-110.06 loss=0.409 pg=0.003 vf=0.846 H=5.606 KL=0.0122 clip_frac=0.165
[PPO] it=   49 steps=  200704 avg10=-115.33 loss=0.168 pg=0.001 vf=0.368 H=5.578 KL=0.0111 clip_frac=0.141
[PPO] it=   52 steps=  212992 avg10=-105.21 loss=0.223 pg=-0.000 vf=0.480 H=5.527 KL=0.0097 clip_frac=0.116
[PPO] it=   55 steps=  225280 avg10= -96.86 loss=0.067 pg=0.001 vf=0.166 H=5.486 KL=0.0086 clip_frac=0.106
[PPO] it=   58 steps=  237568 avg10= -85.44 loss=15.494 pg=0.011 vf=30.999 H=5.458 KL=0.0119 clip_frac=0.078
[PPO] it=   61 steps=  249856 avg10= -91.29 loss=0.205 pg=0.004 vf=0.434 H=5.467 KL=0.0112 clip_frac=0.140
[PPO] it=   64 steps=  262144 avg10= -85.29 loss=0.364 pg=0.004 vf=0.752 H=5.445 KL=0.0116 clip_frac=0.176
[PPO] it=   67 steps=  274432 avg10= -89.28 loss=19.276 pg=0.007 vf=38.572 H=5.440 KL=0.0153 clip_frac=0.106
[PPO] it=   70 steps=  286720 avg10= -94.65 loss=18.980 pg=0.003 vf=37.986 H=5.440 KL=0.0094 clip_frac=0.084
[PPO] it=   73 steps=  299008 avg10= -81.13 loss=0.875 pg=0.009 vf=1.766 H=5.437 KL=0.0179 clip_frac=0.269
[PPO] it=   76 steps=  311296 avg10=-128.04 loss=36.536 pg=0.001 vf=73.102 H=5.440 KL=0.0105 clip_frac=0.112
[PPO] it=   79 steps=  323584 avg10=-111.84 loss=34.230 pg=0.004 vf=68.486 H=5.446 KL=0.0122 clip_frac=0.105
[PPO] it=   82 steps=  335872 avg10=-140.73 loss=22.898 pg=0.003 vf=45.824 H=5.453 KL=0.0125 clip_frac=0.161
[PPO] it=   85 steps=  348160 avg10=-130.28 loss=9.153 pg=0.009 vf=18.322 H=5.466 KL=0.0165 clip_frac=0.176
[PPO] it=   88 steps=  360448 avg10=-115.41 loss=7.267 pg=0.009 vf=14.549 H=5.479 KL=0.0183 clip_frac=0.227
[PPO] it=   91 steps=  372736 avg10=-109.78 loss=14.334 pg=0.004 vf=28.693 H=5.506 KL=0.0110 clip_frac=0.130
[PPO] it=   94 steps=  385024 avg10= -87.23 loss=5.842 pg=0.002 vf=11.712 H=5.483 KL=0.0126 clip_frac=0.084
[PPO] it=   97 steps=  397312 avg10= -83.77 loss=0.103 pg=0.001 vf=0.237 H=5.471 KL=0.0098 clip_frac=0.112
[PPO] it=  100 steps=  409600 avg10= -79.30 loss=0.126 pg=-0.001 vf=0.288 H=5.443 KL=0.0106 clip_frac=0.115
[PPO] it=  103 steps=  421888 avg10= -78.11 loss=0.148 pg=0.002 vf=0.325 H=5.421 KL=0.0102 clip_frac=0.105
[PPO] it=  106 steps=  434176 avg10= -78.07 loss=14.256 pg=0.005 vf=28.534 H=5.423 KL=0.0294 clip_frac=0.128
[PPO] it=  109 steps=  446464 avg10= -78.43 loss=0.193 pg=0.000 vf=0.418 H=5.405 KL=0.0088 clip_frac=0.092
[PPO] it=  112 steps=  458752 avg10= -71.62 loss=0.098 pg=-0.000 vf=0.230 H=5.376 KL=0.0086 clip_frac=0.089
[PPO] it=  115 steps=  471040 avg10= -63.62 loss=0.113 pg=-0.001 vf=0.260 H=5.336 KL=0.0095 clip_frac=0.102
[PPO] it=  118 steps=  483328 avg10= -53.89 loss=0.130 pg=-0.000 vf=0.292 H=5.326 KL=0.0106 clip_frac=0.119
[PPO] it=  121 steps=  495616 avg10= -51.51 loss=0.107 pg=0.001 vf=0.245 H=5.325 KL=0.0076 clip_frac=0.076
[PPO] it=  124 steps=  507904 avg10= -41.35 loss=0.294 pg=-0.002 vf=0.624 H=5.314 KL=0.0097 clip_frac=0.097
[PPO] it=  127 steps=  520192 avg10= -38.83 loss=0.224 pg=-0.000 vf=0.482 H=5.295 KL=0.0109 clip_frac=0.129
[PPO] it=  130 steps=  532480 avg10= -19.68 loss=0.539 pg=-0.003 vf=1.116 H=5.259 KL=0.0088 clip_frac=0.094
[PPO] it=  133 steps=  544768 avg10=  -6.69 loss=0.651 pg=0.000 vf=1.332 H=5.204 KL=0.0091 clip_frac=0.068
[PPO] it=  136 steps=  557056 avg10=  -4.57 loss=0.280 pg=0.000 vf=0.590 H=5.195 KL=0.0105 clip_frac=0.123
[PPO] it=  139 steps=  569344 avg10=  -0.64 loss=0.360 pg=-0.000 vf=0.751 H=5.180 KL=0.0092 clip_frac=0.096
[PPO] it=  142 steps=  581632 avg10=  -7.47 loss=0.456 pg=0.003 vf=0.938 H=5.165 KL=0.0095 clip_frac=0.117
[PPO] it=  145 steps=  593920 avg10= -16.34 loss=4.079 pg=0.012 vf=8.164 H=5.134 KL=0.0159 clip_frac=0.174
[PPO] it=  148 steps=  606208 avg10= -15.80 loss=0.522 pg=0.012 vf=1.051 H=5.127 KL=0.0166 clip_frac=0.196
[PPO] it=  151 steps=  618496 avg10=  11.17 loss=8.010 pg=-0.002 vf=16.054 H=5.112 KL=0.0314 clip_frac=0.119
[PPO] it=  154 steps=  630784 avg10=   5.52 loss=6.314 pg=0.012 vf=12.635 H=5.110 KL=0.0135 clip_frac=0.104
[PPO] it=  157 steps=  643072 avg10=   7.67 loss=0.506 pg=0.006 vf=1.031 H=5.104 KL=0.0116 clip_frac=0.135
[PPO] it=  160 steps=  655360 avg10=  11.10 loss=0.536 pg=0.004 vf=1.095 H=5.114 KL=0.0146 clip_frac=0.177
[PPO] it=  163 steps=  667648 avg10= -13.68 loss=13.530 pg=0.011 vf=27.067 H=5.088 KL=0.0164 clip_frac=0.132
[PPO] it=  166 steps=  679936 avg10= -46.69 loss=19.609 pg=0.005 vf=39.238 H=5.094 KL=0.0147 clip_frac=0.139
[PPO] it=  169 steps=  692224 avg10= -41.23 loss=40.210 pg=0.014 vf=80.422 H=5.098 KL=0.0201 clip_frac=0.158
[PPO] it=  172 steps=  704512 avg10= -42.43 loss=13.746 pg=0.009 vf=27.504 H=5.091 KL=0.0126 clip_frac=0.134
[PPO] it=  175 steps=  716800 avg10= -33.56 loss=1.524 pg=0.024 vf=3.030 H=5.083 KL=0.0233 clip_frac=0.302
[PPO] it=  178 steps=  729088 avg10=-107.54 loss=44.910 pg=0.032 vf=89.787 H=5.087 KL=0.0355 clip_frac=0.437
[PPO] it=  181 steps=  741376 avg10= -81.44 loss=21.971 pg=0.018 vf=43.937 H=5.095 KL=0.0296 clip_frac=0.264
[PPO] it=  184 steps=  753664 avg10= -31.63 loss=8.330 pg=0.010 vf=16.670 H=5.098 KL=0.0892 clip_frac=0.178
[PPO] it=  187 steps=  765952 avg10= -25.72 loss=0.355 pg=0.007 vf=0.728 H=5.080 KL=0.0118 clip_frac=0.147
[PPO] it=  190 steps=  778240 avg10=  -8.76 loss=0.467 pg=0.000 vf=0.963 H=5.072 KL=0.0099 clip_frac=0.110
[PPO] it=  193 steps=  790528 avg10= -15.13 loss=13.725 pg=0.013 vf=27.453 H=5.042 KL=0.1638 clip_frac=0.138
[PPO] it=  196 steps=  802816 avg10= -20.22 loss=0.396 pg=0.001 vf=0.820 H=5.033 KL=0.0113 clip_frac=0.116
[PPO] it=  199 steps=  815104 avg10=   3.69 loss=0.439 pg=-0.001 vf=0.911 H=5.049 KL=0.0112 clip_frac=0.134
[PPO] it=  202 steps=  827392 avg10=  12.44 loss=0.305 pg=-0.002 vf=0.645 H=5.031 KL=0.0118 clip_frac=0.129
[PPO] it=  205 steps=  839680 avg10=  19.36 loss=0.328 pg=0.001 vf=0.684 H=5.014 KL=0.0118 clip_frac=0.134
[PPO] it=  208 steps=  851968 avg10=   6.69 loss=6.990 pg=-0.003 vf=14.016 H=5.016 KL=0.3212 clip_frac=0.196
[PPO] it=  211 steps=  864256 avg10=  13.90 loss=0.654 pg=0.001 vf=1.336 H=5.018 KL=0.0128 clip_frac=0.137
[PPO] it=  214 steps=  876544 avg10=  15.40 loss=20.738 pg=-0.002 vf=41.509 H=5.000 KL=0.1271 clip_frac=0.154
[PPO] it=  217 steps=  888832 avg10=  16.46 loss=0.487 pg=-0.000 vf=1.005 H=4.975 KL=0.0114 clip_frac=0.123
[PPO] it=  220 steps=  900000 avg10=   4.07 loss=31.404 pg=0.084 vf=62.670 H=4.960 KL=0.3506 clip_frac=0.416
[PPO] done steps=900000 time=1392.2s avg10=4.07
Saved BipedalWalker PPO run 15 model to a3_bonus_ppo_artifacts/bipedal_walker/run_15_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_15_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return -109.69 steps 46
Eval episode 2 seed 1228 return -110.03 steps 46
Eval episode 3 seed 1229 return -109.89 steps 45
Eval episode 4 seed 1230 return -109.90 steps 46
Eval episode 5 seed 1231 return -109.93 steps 45
Eval episode 6 seed 1232 return -109.86 steps 46
Eval episode 7 seed 1233 return -109.80 steps 45
Eval episode 8 seed 1234 return -110.03 steps 46
Eval episode 9 seed 1235 return -111.02 steps 50
Eval episode 10 seed 1236 return -109.95 steps 46
Greedy evaluation mean -110.01  std 0.35
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_15_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_15_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=1, seed=1227, return=-109.69, steps=46
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_15_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return -109.69 steps 46 with seed 1227 into a3_bonus_ppo_artifacts/bipedal_walker/run_15_bipedal_ppo/videos
Replayed best episode for video: return=-109.69, steps=46

Run#16

In [ ]:
# Discover BipedalWalker dimensions
tmp_env = make_env(BIPEDAL_ENV_ID, worker_id=0, base_seed=SEED)
obs_dim_bipedal = tmp_env.observation_space.shape[0]
act_dim_bipedal = tmp_env.action_space.shape[0]
tmp_env.close()

print(f"BipedalWalker obs_dim={obs_dim_bipedal}, act_dim={act_dim_bipedal}")

# PPO model config
bipedal_cfg_run16 = PPOContinuousModelConfig(
    obs_dim=obs_dim_bipedal,
    act_dim=act_dim_bipedal,
    hidden_sizes=(256, 256),
)

bipedal_model_run16 = build_ppo_continuous_model_from_config(bipedal_cfg_run16).to(device)

# PPO hyperparameters (wider clip, no entropy)
bipedal_ppo_cfg_run16 = PPOUpdateConfig(
    clip_range=0.20,      # classic PPO clip
    value_coef=0.5,
    entropy_coef=0.0,     # pure exploitation once it stabilizes
    max_grad_norm=0.5,
    n_epochs=4,
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run16 = "run_16_bipedal_ppo"
bipedal_run_dir_run16 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run16)
print(f"BipedalWalker PPO run 16 dir: {bipedal_run_dir_run16}")

# Training budget
bipedal_total_steps_run16 = 1_000_000
bipedal_rollout_len_run16 = 4096

# Train PPO on BipedalWalker
bipedal_model_run16, bipedal_episode_returns_run16, bipedal_logs_run16 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run16,
    control_type="continuous",
    run_dir=bipedal_run_dir_run16,
    total_env_steps=bipedal_total_steps_run16,
    rollout_len=bipedal_rollout_len_run16,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run16,
    lr=3e-4,              # most aggressive LR
    log_every=10_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run16, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run16, dtype=np.float32),
)

bipedal_model_path_run16 = os.path.join(bipedal_run_dir_run16, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run16.state_dict(), bipedal_model_path_run16)
print(f"Saved BipedalWalker PPO run 16 model to {bipedal_model_path_run16}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run16,
    run_dir=bipedal_run_dir_run16,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 16)",
    ma_window=20,
)

# Greedy evaluation + CSV
csv_path_bipedal_run16 = os.path.join(bipedal_run_dir_run16, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run16 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run16,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run16,
)

np.save(
    os.path.join(bipedal_run_dir_run16, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run16, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run16,
    run_dir=bipedal_run_dir_run16,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 16)",
    ma_window=3,
)

# Record best eval video
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run16,
    control_type="continuous",
    run_dir=bipedal_run_dir_run16,
    csv_path=csv_path_bipedal_run16,
    max_steps=1600,
)
BipedalWalker obs_dim=24, act_dim=4
BipedalWalker PPO run 16 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_16_bipedal_ppo
[PPO] it=    1 steps=    4096 avg10=-110.02 loss=105.092 pg=0.007 vf=210.170 H=5.661 KL=0.0201 clip_frac=0.162
[PPO] it=    4 steps=   16384 avg10=-112.83 loss=56.096 pg=0.008 vf=112.177 H=5.647 KL=0.0187 clip_frac=0.158
[PPO] it=    7 steps=   28672 avg10=-137.00 loss=24.232 pg=0.005 vf=48.454 H=5.634 KL=0.0199 clip_frac=0.203
[PPO] it=   10 steps=   40960 avg10=-113.35 loss=33.411 pg=0.006 vf=66.810 H=5.657 KL=0.0204 clip_frac=0.163
[PPO] it=   13 steps=   53248 avg10=-110.08 loss=42.888 pg=0.009 vf=85.758 H=5.684 KL=0.0225 clip_frac=0.142
[PPO] it=   16 steps=   65536 avg10=-115.91 loss=36.980 pg=0.005 vf=73.950 H=5.671 KL=0.0171 clip_frac=0.143
[PPO] it=   19 steps=   77824 avg10=-115.19 loss=16.864 pg=0.004 vf=33.719 H=5.658 KL=0.0234 clip_frac=0.208
[PPO] it=   22 steps=   90112 avg10=-105.11 loss=7.706 pg=0.010 vf=15.393 H=5.648 KL=0.0268 clip_frac=0.263
[PPO] it=   25 steps=  102400 avg10=-109.74 loss=8.247 pg=0.007 vf=16.480 H=5.633 KL=0.0214 clip_frac=0.232
[PPO] it=   28 steps=  114688 avg10=-110.66 loss=21.997 pg=0.007 vf=43.980 H=5.593 KL=0.0200 clip_frac=0.197
[PPO] it=   31 steps=  126976 avg10= -98.72 loss=0.982 pg=0.011 vf=1.941 H=5.616 KL=0.0317 clip_frac=0.318
[PPO] it=   34 steps=  139264 avg10=-119.86 loss=13.867 pg=0.005 vf=27.724 H=5.624 KL=0.0217 clip_frac=0.158
[PPO] it=   37 steps=  151552 avg10=-115.45 loss=23.029 pg=0.004 vf=46.049 H=5.622 KL=0.0255 clip_frac=0.208
[PPO] it=   40 steps=  163840 avg10=-104.28 loss=0.436 pg=0.021 vf=0.829 H=5.617 KL=0.0385 clip_frac=0.344
[PPO] it=   43 steps=  176128 avg10= -87.02 loss=0.164 pg=0.004 vf=0.321 H=5.504 KL=0.0210 clip_frac=0.206
[PPO] it=   46 steps=  188416 avg10= -94.77 loss=22.621 pg=-0.006 vf=45.254 H=5.475 KL=0.1086 clip_frac=0.117
[PPO] it=   49 steps=  200704 avg10= -97.20 loss=5.802 pg=-0.016 vf=11.637 H=5.443 KL=1.0441 clip_frac=0.227
[PPO] it=   52 steps=  212992 avg10= -87.60 loss=0.145 pg=0.001 vf=0.287 H=5.459 KL=0.0157 clip_frac=0.149
[PPO] it=   55 steps=  225280 avg10= -83.19 loss=0.137 pg=0.002 vf=0.268 H=5.421 KL=0.0166 clip_frac=0.139
[PPO] it=   58 steps=  237568 avg10= -80.94 loss=0.141 pg=0.002 vf=0.278 H=5.395 KL=0.0170 clip_frac=0.155
[PPO] it=   61 steps=  249856 avg10= -81.89 loss=0.166 pg=-0.000 vf=0.334 H=5.392 KL=0.0131 clip_frac=0.095
[PPO] it=   64 steps=  262144 avg10= -96.28 loss=0.239 pg=0.001 vf=0.477 H=5.385 KL=0.0183 clip_frac=0.154
[PPO] it=   67 steps=  274432 avg10= -79.04 loss=0.098 pg=0.000 vf=0.195 H=5.310 KL=0.0143 clip_frac=0.111
[PPO] it=   70 steps=  286720 avg10= -74.61 loss=0.144 pg=0.001 vf=0.287 H=5.287 KL=0.0143 clip_frac=0.097
[PPO] it=   73 steps=  299008 avg10= -73.00 loss=0.209 pg=0.002 vf=0.415 H=5.277 KL=0.0173 clip_frac=0.123
[PPO] it=   76 steps=  311296 avg10= -97.90 loss=0.367 pg=0.037 vf=0.660 H=5.296 KL=0.0386 clip_frac=0.093
[PPO] it=   79 steps=  323584 avg10= -73.94 loss=0.115 pg=0.000 vf=0.229 H=5.257 KL=0.0145 clip_frac=0.122
[PPO] it=   82 steps=  335872 avg10= -73.03 loss=0.153 pg=-0.001 vf=0.307 H=5.204 KL=0.0172 clip_frac=0.113
[PPO] it=   85 steps=  348160 avg10= -65.13 loss=0.319 pg=-0.001 vf=0.640 H=5.144 KL=0.0160 clip_frac=0.120
[PPO] it=   88 steps=  360448 avg10= -94.99 loss=0.464 pg=0.033 vf=0.862 H=5.112 KL=0.0406 clip_frac=0.131
[PPO] it=   91 steps=  372736 avg10= -43.60 loss=0.399 pg=-0.002 vf=0.802 H=5.088 KL=0.0171 clip_frac=0.118
[PPO] it=   94 steps=  385024 avg10= -31.57 loss=0.270 pg=-0.002 vf=0.544 H=5.027 KL=0.0173 clip_frac=0.132
[PPO] it=   97 steps=  397312 avg10= -23.23 loss=0.363 pg=0.000 vf=0.727 H=4.964 KL=0.0205 clip_frac=0.146
[PPO] it=  100 steps=  409600 avg10= -87.03 loss=0.716 pg=0.044 vf=1.342 H=4.923 KL=0.0627 clip_frac=0.188
[PPO] it=  103 steps=  421888 avg10= -10.49 loss=0.539 pg=-0.002 vf=1.083 H=4.885 KL=0.0195 clip_frac=0.129
[PPO] it=  106 steps=  434176 avg10=  -6.57 loss=0.552 pg=-0.004 vf=1.113 H=4.832 KL=0.0173 clip_frac=0.144
[PPO] it=  109 steps=  446464 avg10=   1.79 loss=0.560 pg=-0.003 vf=1.125 H=4.808 KL=0.0159 clip_frac=0.120
[PPO] it=  112 steps=  458752 avg10= -65.79 loss=0.619 pg=0.037 vf=1.164 H=4.789 KL=0.1230 clip_frac=0.137
[PPO] it=  115 steps=  471040 avg10=   6.53 loss=0.523 pg=-0.004 vf=1.054 H=4.758 KL=0.0332 clip_frac=0.144
[PPO] it=  118 steps=  483328 avg10=  19.95 loss=0.573 pg=-0.003 vf=1.151 H=4.718 KL=0.0217 clip_frac=0.112
[PPO] it=  121 steps=  495616 avg10=  32.62 loss=0.471 pg=-0.001 vf=0.944 H=4.693 KL=0.0179 clip_frac=0.142
[PPO] it=  124 steps=  507904 avg10= -25.34 loss=0.575 pg=0.000 vf=1.149 H=4.672 KL=0.0223 clip_frac=0.183
[PPO] it=  127 steps=  520192 avg10=  38.71 loss=0.700 pg=0.004 vf=1.392 H=4.640 KL=0.0235 clip_frac=0.166
[PPO] it=  130 steps=  532480 avg10=  43.37 loss=0.719 pg=-0.003 vf=1.445 H=4.583 KL=0.0173 clip_frac=0.149
[PPO] it=  133 steps=  544768 avg10= -97.13 loss=8.975 pg=-0.074 vf=18.099 H=4.535 KL=23.6693 clip_frac=0.474
[PPO] it=  136 steps=  557056 avg10=  37.31 loss=0.824 pg=-0.001 vf=1.649 H=4.476 KL=0.0235 clip_frac=0.169
[PPO] it=  139 steps=  569344 avg10=  68.52 loss=0.560 pg=-0.001 vf=1.123 H=4.425 KL=0.0222 clip_frac=0.180
[PPO] it=  142 steps=  581632 avg10=  78.15 loss=0.564 pg=-0.001 vf=1.128 H=4.367 KL=0.0221 clip_frac=0.195
[PPO] it=  145 steps=  593920 avg10= -18.70 loss=1.109 pg=-0.005 vf=2.229 H=4.327 KL=0.0296 clip_frac=0.202
[PPO] it=  148 steps=  606208 avg10=  84.58 loss=0.771 pg=-0.001 vf=1.544 H=4.307 KL=0.0304 clip_frac=0.216
[PPO] it=  151 steps=  618496 avg10=  91.81 loss=0.676 pg=0.001 vf=1.350 H=4.310 KL=0.0235 clip_frac=0.196
[PPO] it=  154 steps=  630784 avg10= -71.39 loss=1.181 pg=0.033 vf=2.295 H=4.299 KL=1.9981 clip_frac=0.272
[PPO] it=  157 steps=  643072 avg10=  83.15 loss=0.638 pg=-0.005 vf=1.285 H=4.260 KL=0.0253 clip_frac=0.179
[PPO] it=  160 steps=  655360 avg10= 112.31 loss=0.855 pg=-0.000 vf=1.710 H=4.235 KL=0.0234 clip_frac=0.200
[PPO] it=  163 steps=  667648 avg10=-115.05 loss=10.855 pg=-0.055 vf=21.820 H=4.256 KL=32.2020 clip_frac=0.534
[PPO] it=  166 steps=  679936 avg10=  53.07 loss=0.810 pg=0.006 vf=1.607 H=4.258 KL=0.0356 clip_frac=0.233
[PPO] it=  169 steps=  692224 avg10= 127.24 loss=0.635 pg=0.003 vf=1.265 H=4.242 KL=0.0198 clip_frac=0.193
[PPO] it=  172 steps=  704512 avg10=  54.80 loss=9.073 pg=-0.013 vf=18.173 H=4.233 KL=2.7784 clip_frac=0.311
[PPO] it=  175 steps=  716800 avg10=  26.26 loss=1.015 pg=0.002 vf=2.025 H=4.232 KL=0.0270 clip_frac=0.208
[PPO] it=  178 steps=  729088 avg10= 126.87 loss=0.987 pg=0.008 vf=1.958 H=4.261 KL=0.0306 clip_frac=0.224
[PPO] it=  181 steps=  741376 avg10= 103.72 loss=10.850 pg=-0.001 vf=21.703 H=4.267 KL=0.9154 clip_frac=0.266
[PPO] it=  184 steps=  753664 avg10=  32.36 loss=0.903 pg=0.003 vf=1.801 H=4.224 KL=0.0408 clip_frac=0.246
[PPO] it=  187 steps=  765952 avg10= 122.32 loss=1.017 pg=0.001 vf=2.032 H=4.218 KL=0.0289 clip_frac=0.243
[PPO] it=  190 steps=  778240 avg10=-115.06 loss=9.818 pg=-0.041 vf=19.718 H=4.225 KL=21.6874 clip_frac=0.488
[PPO] it=  193 steps=  790528 avg10=  56.34 loss=0.603 pg=0.005 vf=1.196 H=4.206 KL=0.0334 clip_frac=0.269
[PPO] it=  196 steps=  802816 avg10= 134.09 loss=0.940 pg=0.000 vf=1.880 H=4.197 KL=0.0359 clip_frac=0.256
[PPO] it=  199 steps=  815104 avg10= -89.79 loss=0.827 pg=-0.039 vf=1.730 H=4.193 KL=34.2152 clip_frac=0.582
[PPO] it=  202 steps=  827392 avg10=  85.49 loss=1.175 pg=0.011 vf=2.328 H=4.192 KL=0.0373 clip_frac=0.297
[PPO] it=  205 steps=  839680 avg10= 136.11 loss=0.695 pg=0.001 vf=1.389 H=4.207 KL=0.0246 clip_frac=0.218
[PPO] it=  208 steps=  851968 avg10= -38.76 loss=0.962 pg=0.019 vf=1.886 H=4.170 KL=0.2604 clip_frac=0.244
[PPO] it=  211 steps=  864256 avg10= 143.77 loss=0.591 pg=0.004 vf=1.174 H=4.205 KL=0.0261 clip_frac=0.249
[PPO] it=  214 steps=  876544 avg10=-115.04 loss=10.319 pg=-0.045 vf=20.728 H=4.185 KL=41.1528 clip_frac=0.590
[PPO] it=  217 steps=  888832 avg10=  68.46 loss=0.549 pg=0.002 vf=1.093 H=4.153 KL=0.0258 clip_frac=0.233
[PPO] it=  220 steps=  901120 avg10= 153.40 loss=1.164 pg=0.002 vf=2.325 H=4.119 KL=0.0314 clip_frac=0.213
[PPO] it=  223 steps=  913408 avg10=  -8.82 loss=1.035 pg=0.027 vf=2.017 H=4.086 KL=0.2768 clip_frac=0.223
[PPO] it=  226 steps=  925696 avg10= 154.20 loss=0.638 pg=0.000 vf=1.276 H=4.066 KL=0.0640 clip_frac=0.251
[PPO] it=  229 steps=  937984 avg10= -87.07 loss=1.068 pg=-0.020 vf=2.176 H=4.005 KL=11.4779 clip_frac=0.385
[PPO] it=  232 steps=  950272 avg10= 133.36 loss=0.443 pg=0.003 vf=0.881 H=4.007 KL=0.0297 clip_frac=0.257
[PPO] it=  235 steps=  962560 avg10=-115.12 loss=10.631 pg=-0.007 vf=21.276 H=4.006 KL=68.3810 clip_frac=0.748
[PPO] it=  238 steps=  974848 avg10= 105.90 loss=1.087 pg=0.008 vf=2.158 H=3.998 KL=0.0355 clip_frac=0.263
[PPO] it=  241 steps=  987136 avg10=-115.03 loss=10.427 pg=-0.031 vf=20.915 H=4.040 KL=74.6460 clip_frac=0.786
[PPO] it=  244 steps=  999424 avg10=  71.19 loss=0.693 pg=0.003 vf=1.380 H=4.010 KL=0.0701 clip_frac=0.248
[PPO] it=  245 steps= 1000000 avg10=  99.41 loss=4.225 pg=0.004 vf=8.442 H=3.999 KL=0.0999 clip_frac=0.297
[PPO] done steps=1000000 time=1559.1s avg10=99.41
Saved BipedalWalker PPO run 16 model to a3_bonus_ppo_artifacts/bipedal_walker/run_16_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_16_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return 102.38 steps 1600
Eval episode 2 seed 1228 return 107.83 steps 1600
Eval episode 3 seed 1229 return 81.60 steps 1600
Eval episode 4 seed 1230 return 99.26 steps 1600
Eval episode 5 seed 1231 return 140.17 steps 1600
Eval episode 6 seed 1232 return 132.24 steps 1600
Eval episode 7 seed 1233 return 56.39 steps 1600
Eval episode 8 seed 1234 return 99.93 steps 1600
Eval episode 9 seed 1235 return 83.97 steps 1600
Eval episode 10 seed 1236 return 95.47 steps 1600
Greedy evaluation mean 99.93  std 22.89
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_16_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_16_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=5, seed=1231, return=140.17, steps=1600
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_16_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 140.17 steps 1600 with seed 1231 into a3_bonus_ppo_artifacts/bipedal_walker/run_16_bipedal_ppo/videos
Replayed best episode for video: return=140.17, steps=1600

Run#17

In [ ]:
# Discover BipedalWalker dimensions
tmp_env = make_env(BIPEDAL_ENV_ID, worker_id=0, base_seed=SEED)
obs_dim_bipedal = tmp_env.observation_space.shape[0]
act_dim_bipedal = tmp_env.action_space.shape[0]
tmp_env.close()

print(f"BipedalWalker obs_dim={obs_dim_bipedal}, act_dim={act_dim_bipedal}")

# PPO model config (same 256x256 MLP)
bipedal_cfg_run17 = PPOContinuousModelConfig(
    obs_dim=obs_dim_bipedal,
    act_dim=act_dim_bipedal,
    hidden_sizes=(256, 256),
)

bipedal_model_run17 = build_ppo_continuous_model_from_config(bipedal_cfg_run17).to(device)


bipedal_ppo_cfg_run17 = PPOUpdateConfig(
    clip_range=0.20,      # standard PPO clip
    value_coef=0.5,       # match A2C v2
    entropy_coef=0.01,
    max_grad_norm=0.3,    # match more conservative gradient clipping
    n_epochs=4,
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run17 = "run_17_bipedal_ppo"
bipedal_run_dir_run17 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run17)
print(f"BipedalWalker PPO run 17 dir: {bipedal_run_dir_run17}")

# Training budget
bipedal_total_steps_run17 = 600_000
bipedal_rollout_len_run17 = 2048   # slightly shorter rollouts than 4096

# Train PPO on BipedalWalker
bipedal_model_run17, bipedal_episode_returns_run17, bipedal_logs_run17 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run17,
    control_type="continuous",
    run_dir=bipedal_run_dir_run17,
    total_env_steps=bipedal_total_steps_run17,
    rollout_len=bipedal_rollout_len_run17,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run17,
    lr=2.5e-4,
    log_every=20_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run17, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run17, dtype=np.float32),
)

bipedal_model_path_run17 = os.path.join(bipedal_run_dir_run17, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run17.state_dict(), bipedal_model_path_run17)
print(f"Saved BipedalWalker PPO run 17 model to {bipedal_model_path_run17}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run17,
    run_dir=bipedal_run_dir_run17,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 17)",
    ma_window=20,
)

# Greedy evaluation with CSV logging (10 episodes)
csv_path_bipedal_run17 = os.path.join(bipedal_run_dir_run17, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run17 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run17,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run17,
)

# Save eval .npy
np.save(
    os.path.join(bipedal_run_dir_run17, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run17, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run17,
    run_dir=bipedal_run_dir_run17,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 17)",
    ma_window=3,
)

# Record video of the best greedy evaluation episode
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run17,
    control_type="continuous",
    run_dir=bipedal_run_dir_run17,
    csv_path=csv_path_bipedal_run17,
    max_steps=1600,
)
BipedalWalker obs_dim=24, act_dim=4
BipedalWalker PPO run 17 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_17_bipedal_ppo
[PPO] it=    1 steps=    2048 avg10=-115.02 loss=242.019 pg=-0.002 vf=484.156 H=5.683 KL=0.0104 clip_frac=0.065
[PPO] it=   11 steps=   22528 avg10=-104.78 loss=56.706 pg=0.002 vf=113.522 H=5.724 KL=0.0135 clip_frac=0.064
[PPO] it=   21 steps=   43008 avg10=-112.16 loss=48.841 pg=-0.001 vf=97.799 H=5.741 KL=0.0129 clip_frac=0.051
[PPO] it=   31 steps=   63488 avg10=-111.69 loss=0.136 pg=0.001 vf=0.386 H=5.744 KL=0.0127 clip_frac=0.119
[PPO] it=   41 steps=   83968 avg10=-108.42 loss=32.241 pg=-0.004 vf=64.606 H=5.739 KL=0.0172 clip_frac=0.115
[PPO] it=   51 steps=  104448 avg10=-119.20 loss=0.208 pg=0.005 vf=0.522 H=5.755 KL=0.0233 clip_frac=0.242
[PPO] it=   61 steps=  124928 avg10=-123.62 loss=0.017 pg=0.002 vf=0.146 H=5.778 KL=0.0148 clip_frac=0.121
[PPO] it=   71 steps=  145408 avg10=-116.31 loss=0.057 pg=0.001 vf=0.226 H=5.751 KL=0.0151 clip_frac=0.111
[PPO] it=   81 steps=  165888 avg10= -99.18 loss=0.150 pg=-0.003 vf=0.421 H=5.757 KL=0.0173 clip_frac=0.148
[PPO] it=   91 steps=  186368 avg10= -77.99 loss=0.159 pg=-0.001 vf=0.434 H=5.742 KL=0.0136 clip_frac=0.079
[PPO] it=  101 steps=  206848 avg10= -63.27 loss=0.175 pg=-0.000 vf=0.467 H=5.785 KL=0.0097 clip_frac=0.059
[PPO] it=  111 steps=  227328 avg10= -49.45 loss=0.071 pg=-0.003 vf=0.263 H=5.711 KL=0.0115 clip_frac=0.081
[PPO] it=  121 steps=  247808 avg10= -33.48 loss=0.314 pg=-0.002 vf=0.746 H=5.685 KL=0.0108 clip_frac=0.096
[PPO] it=  131 steps=  268288 avg10= -40.82 loss=2.837 pg=0.055 vf=5.678 H=5.710 KL=0.0509 clip_frac=0.314
[PPO] it=  141 steps=  288768 avg10=  -5.22 loss=0.338 pg=-0.000 vf=0.790 H=5.651 KL=0.0141 clip_frac=0.089
[PPO] it=  151 steps=  309248 avg10=  11.28 loss=0.386 pg=-0.002 vf=0.887 H=5.600 KL=0.0117 clip_frac=0.073
[PPO] it=  161 steps=  329728 avg10=  23.87 loss=25.361 pg=-0.015 vf=50.864 H=5.546 KL=0.3398 clip_frac=0.203
[PPO] it=  171 steps=  350208 avg10=  48.95 loss=1.001 pg=-0.003 vf=2.120 H=5.569 KL=0.0163 clip_frac=0.145
[PPO] it=  181 steps=  370688 avg10=  41.61 loss=0.613 pg=0.008 vf=1.322 H=5.540 KL=0.0186 clip_frac=0.153
[PPO] it=  191 steps=  391168 avg10=  41.61 loss=2.316 pg=0.014 vf=4.713 H=5.537 KL=0.0349 clip_frac=0.284
[PPO] it=  201 steps=  411648 avg10=  29.37 loss=2.477 pg=0.000 vf=5.064 H=5.581 KL=0.0174 clip_frac=0.155
[PPO] it=  211 steps=  432128 avg10=  88.80 loss=0.599 pg=-0.000 vf=1.311 H=5.635 KL=0.0215 clip_frac=0.192
[PPO] it=  221 steps=  452608 avg10=  60.82 loss=2.822 pg=0.165 vf=5.426 H=5.658 KL=1.4141 clip_frac=0.228
[PPO] it=  231 steps=  473088 avg10=  88.14 loss=1.021 pg=-0.001 vf=2.157 H=5.689 KL=0.0189 clip_frac=0.212
[PPO] it=  241 steps=  493568 avg10= 109.50 loss=20.356 pg=-0.015 vf=40.857 H=5.715 KL=11.7333 clip_frac=0.387
[PPO] it=  251 steps=  514048 avg10= -91.17 loss=0.769 pg=0.014 vf=1.626 H=5.779 KL=0.0402 clip_frac=0.285
[PPO] it=  261 steps=  534528 avg10= -97.25 loss=0.884 pg=0.004 vf=1.878 H=5.852 KL=0.0557 clip_frac=0.194
[PPO] it=  271 steps=  555008 avg10= -79.53 loss=0.257 pg=-0.005 vf=0.642 H=5.908 KL=0.0223 clip_frac=0.214
[PPO] it=  281 steps=  575488 avg10= -49.92 loss=0.265 pg=-0.000 vf=0.649 H=5.967 KL=0.0196 clip_frac=0.186
[PPO] it=  291 steps=  595968 avg10=  -5.32 loss=0.271 pg=0.000 vf=0.661 H=5.953 KL=0.0231 clip_frac=0.167
[PPO] it=  293 steps=  600000 avg10=  -1.64 loss=0.489 pg=-0.001 vf=1.100 H=5.983 KL=0.0167 clip_frac=0.146
[PPO] done steps=600000 time=937.0s avg10=-1.64
Saved BipedalWalker PPO run 17 model to a3_bonus_ppo_artifacts/bipedal_walker/run_17_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_17_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return 61.71 steps 1600
Eval episode 2 seed 1228 return 60.39 steps 1600
Eval episode 3 seed 1229 return 64.26 steps 1600
Eval episode 4 seed 1230 return 62.44 steps 1600
Eval episode 5 seed 1231 return 76.25 steps 1600
Eval episode 6 seed 1232 return 43.02 steps 1600
Eval episode 7 seed 1233 return 46.86 steps 1600
Eval episode 8 seed 1234 return 61.37 steps 1600
Eval episode 9 seed 1235 return 60.05 steps 1600
Eval episode 10 seed 1236 return 52.10 steps 1600
Greedy evaluation mean 58.84  std 8.96
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_17_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_17_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=5, seed=1231, return=76.25, steps=1600
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_17_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 76.25 steps 1600 with seed 1231 into a3_bonus_ppo_artifacts/bipedal_walker/run_17_bipedal_ppo/videos
Replayed best episode for video: return=76.25, steps=1600

Run#18

In [ ]:
# Discover BipedalWalker dimensions
tmp_env = make_env(BIPEDAL_ENV_ID, worker_id=0, base_seed=SEED)
obs_dim_bipedal = tmp_env.observation_space.shape[0]
act_dim_bipedal = tmp_env.action_space.shape[0]
tmp_env.close()

print(f"BipedalWalker obs_dim={obs_dim_bipedal}, act_dim={act_dim_bipedal}")

# PPO model config (same 256-256 MLP)
bipedal_cfg_run18 = PPOContinuousModelConfig(
    obs_dim=obs_dim_bipedal,
    act_dim=act_dim_bipedal,
    hidden_sizes=(256, 256),
)

bipedal_model_run18 = build_ppo_continuous_model_from_config(bipedal_cfg_run18).to(device)

# PPO hyperparameters (slightly smaller clip + LR than run 17)
bipedal_ppo_cfg_run18 = PPOUpdateConfig(
    clip_range=0.15,      # between 0.10 (run 12) and 0.20 (run 16/17)
    value_coef=0.5,
    entropy_coef=0.003,   # mild exploration
    max_grad_norm=0.5,
    n_epochs=4,
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run18 = "run_18_bipedal_ppo"
bipedal_run_dir_run18 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run18)
print(f"BipedalWalker PPO run 18 dir: {bipedal_run_dir_run18}")

# Training budget (a bit longer than run 17)
bipedal_total_steps_run18 = 800_000
bipedal_rollout_len_run18 = 4096

# Train PPO on BipedalWalker
bipedal_model_run18, bipedal_episode_returns_run18, bipedal_logs_run18 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run18,
    control_type="continuous",
    run_dir=bipedal_run_dir_run18,
    total_env_steps=bipedal_total_steps_run18,
    rollout_len=bipedal_rollout_len_run18,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run18,
    lr=2e-4,              # slightly lower than run 17 (2.5e-4)
    log_every=10_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run18, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run18, dtype=np.float32),
)

bipedal_model_path_run18 = os.path.join(bipedal_run_dir_run18, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run18.state_dict(), bipedal_model_path_run18)
print(f"Saved BipedalWalker PPO run 18 model to {bipedal_model_path_run18}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run18,
    run_dir=bipedal_run_dir_run18,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 18)",
    ma_window=20,
)

# Greedy evaluation with CSV logging (10 episodes)
csv_path_bipedal_run18 = os.path.join(bipedal_run_dir_run18, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run18 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run18,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run18,
)

# save eval .npy
np.save(
    os.path.join(bipedal_run_dir_run18, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run18, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run18,
    run_dir=bipedal_run_dir_run18,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 18)",
    ma_window=3,
)

# Record video of the best greedy evaluation episode
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run18,
    control_type="continuous",
    run_dir=bipedal_run_dir_run18,
    csv_path=csv_path_bipedal_run18,
    max_steps=1600,
)
BipedalWalker obs_dim=24, act_dim=4
BipedalWalker PPO run 18 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_18_bipedal_ppo
[PPO] it=    1 steps=    4096 avg10=-106.61 loss=84.735 pg=0.002 vf=169.500 H=5.676 KL=0.0125 clip_frac=0.157
[PPO] it=    4 steps=   16384 avg10=-112.80 loss=56.433 pg=0.007 vf=112.886 H=5.686 KL=0.0135 clip_frac=0.172
[PPO] it=    7 steps=   28672 avg10=-110.56 loss=43.326 pg=0.009 vf=86.669 H=5.709 KL=0.0174 clip_frac=0.245
[PPO] it=   10 steps=   40960 avg10=-114.98 loss=19.504 pg=0.007 vf=39.028 H=5.698 KL=0.0146 clip_frac=0.216
[PPO] it=   13 steps=   53248 avg10=-114.80 loss=32.970 pg=0.008 vf=65.958 H=5.642 KL=0.0132 clip_frac=0.154
[PPO] it=   16 steps=   65536 avg10=-113.57 loss=0.678 pg=0.007 vf=1.377 H=5.645 KL=0.0182 clip_frac=0.276
[PPO] it=   19 steps=   77824 avg10=-114.29 loss=8.951 pg=0.003 vf=17.930 H=5.659 KL=0.0147 clip_frac=0.203
[PPO] it=   22 steps=   90112 avg10=-112.60 loss=10.260 pg=0.008 vf=20.539 H=5.651 KL=0.0120 clip_frac=0.141
[PPO] it=   25 steps=  102400 avg10=-107.07 loss=0.290 pg=0.003 vf=0.609 H=5.669 KL=0.0117 clip_frac=0.171
[PPO] it=   28 steps=  114688 avg10= -98.76 loss=0.058 pg=0.002 vf=0.148 H=5.671 KL=0.0100 clip_frac=0.123
[PPO] it=   31 steps=  126976 avg10= -88.81 loss=12.266 pg=0.052 vf=24.461 H=5.626 KL=0.0155 clip_frac=0.152
[PPO] it=   34 steps=  139264 avg10= -87.89 loss=0.071 pg=0.002 vf=0.172 H=5.609 KL=0.0105 clip_frac=0.130
[PPO] it=   37 steps=  151552 avg10= -86.64 loss=0.067 pg=0.000 vf=0.167 H=5.567 KL=0.0101 clip_frac=0.117
[PPO] it=   40 steps=  163840 avg10= -82.14 loss=0.116 pg=-0.001 vf=0.267 H=5.527 KL=0.0107 clip_frac=0.131
[PPO] it=   43 steps=  176128 avg10= -74.75 loss=0.111 pg=0.002 vf=0.252 H=5.507 KL=0.0085 clip_frac=0.078
[PPO] it=   46 steps=  188416 avg10= -80.34 loss=19.274 pg=0.002 vf=38.576 H=5.493 KL=0.0102 clip_frac=0.110
[PPO] it=   49 steps=  200704 avg10= -85.74 loss=0.190 pg=0.000 vf=0.413 H=5.469 KL=0.0092 clip_frac=0.096
[PPO] it=   52 steps=  212992 avg10= -67.69 loss=0.186 pg=0.001 vf=0.403 H=5.460 KL=0.0113 clip_frac=0.113
[PPO] it=   55 steps=  225280 avg10= -67.78 loss=0.211 pg=-0.001 vf=0.458 H=5.441 KL=0.0093 clip_frac=0.106
[PPO] it=   58 steps=  237568 avg10= -62.59 loss=0.118 pg=-0.001 vf=0.270 H=5.440 KL=0.0092 clip_frac=0.088
[PPO] it=   61 steps=  249856 avg10= -59.74 loss=0.102 pg=-0.004 vf=0.243 H=5.411 KL=0.0111 clip_frac=0.114
[PPO] it=   64 steps=  262144 avg10= -57.36 loss=8.860 pg=0.089 vf=17.575 H=5.397 KL=0.0665 clip_frac=0.163
[PPO] it=   67 steps=  274432 avg10=-120.99 loss=19.060 pg=0.002 vf=38.148 H=5.412 KL=0.0106 clip_frac=0.094
[PPO] it=   70 steps=  286720 avg10= -93.42 loss=3.928 pg=0.013 vf=7.862 H=5.415 KL=0.0157 clip_frac=0.147
[PPO] it=   73 steps=  299008 avg10= -77.97 loss=0.237 pg=0.001 vf=0.503 H=5.411 KL=0.0123 clip_frac=0.149
[PPO] it=   76 steps=  311296 avg10= -54.98 loss=0.130 pg=-0.002 vf=0.295 H=5.362 KL=0.0103 clip_frac=0.120
[PPO] it=   79 steps=  323584 avg10= -57.13 loss=0.174 pg=0.001 vf=0.379 H=5.312 KL=0.0096 clip_frac=0.107
[PPO] it=   82 steps=  335872 avg10= -53.47 loss=0.121 pg=-0.004 vf=0.282 H=5.308 KL=0.0098 clip_frac=0.118
[PPO] it=   85 steps=  348160 avg10= -48.08 loss=0.234 pg=-0.001 vf=0.501 H=5.282 KL=0.0093 clip_frac=0.097
[PPO] it=   88 steps=  360448 avg10= -35.38 loss=0.138 pg=-0.004 vf=0.316 H=5.240 KL=0.0102 clip_frac=0.144
[PPO] it=   91 steps=  372736 avg10= -23.82 loss=0.319 pg=-0.002 vf=0.672 H=5.208 KL=0.0100 clip_frac=0.098
[PPO] it=   94 steps=  385024 avg10= -15.36 loss=0.262 pg=-0.002 vf=0.559 H=5.194 KL=0.0109 clip_frac=0.120
[PPO] it=   97 steps=  397312 avg10=  -6.23 loss=0.154 pg=-0.000 vf=0.340 H=5.158 KL=0.0107 clip_frac=0.112
[PPO] it=  100 steps=  409600 avg10=   2.74 loss=0.213 pg=-0.001 vf=0.458 H=5.123 KL=0.0108 clip_frac=0.108
[PPO] it=  103 steps=  421888 avg10=  18.37 loss=0.546 pg=0.001 vf=1.122 H=5.107 KL=0.0107 clip_frac=0.106
[PPO] it=  106 steps=  434176 avg10=  31.42 loss=0.671 pg=-0.003 vf=1.378 H=5.074 KL=0.0107 clip_frac=0.123
[PPO] it=  109 steps=  446464 avg10=  41.95 loss=0.395 pg=-0.001 vf=0.821 H=5.036 KL=0.0110 clip_frac=0.125
[PPO] it=  112 steps=  458752 avg10=  41.88 loss=1.080 pg=0.035 vf=2.120 H=5.020 KL=0.0265 clip_frac=0.214
[PPO] it=  115 steps=  471040 avg10=  57.12 loss=0.638 pg=-0.001 vf=1.307 H=4.992 KL=0.0108 clip_frac=0.110
[PPO] it=  118 steps=  483328 avg10=  31.06 loss=0.360 pg=0.017 vf=0.716 H=4.966 KL=0.0159 clip_frac=0.178
[PPO] it=  121 steps=  495616 avg10=  63.98 loss=0.438 pg=-0.002 vf=0.909 H=4.919 KL=0.0128 clip_frac=0.129
[PPO] it=  124 steps=  507904 avg10=  43.17 loss=0.740 pg=0.016 vf=1.479 H=4.905 KL=0.0176 clip_frac=0.178
[PPO] it=  127 steps=  520192 avg10=  71.04 loss=0.442 pg=-0.003 vf=0.919 H=4.883 KL=0.0128 clip_frac=0.153
[PPO] it=  130 steps=  532480 avg10=  59.47 loss=0.413 pg=0.000 vf=0.854 H=4.860 KL=0.0118 clip_frac=0.133
[PPO] it=  133 steps=  544768 avg10=  67.89 loss=0.469 pg=-0.002 vf=0.972 H=4.869 KL=0.0107 clip_frac=0.128
[PPO] it=  136 steps=  557056 avg10=  78.41 loss=0.452 pg=-0.001 vf=0.934 H=4.843 KL=0.0116 clip_frac=0.142
[PPO] it=  139 steps=  569344 avg10=  84.51 loss=0.419 pg=0.000 vf=0.867 H=4.820 KL=0.0109 clip_frac=0.144
[PPO] it=  142 steps=  581632 avg10=  88.79 loss=0.750 pg=-0.002 vf=1.533 H=4.784 KL=0.0115 clip_frac=0.139
[PPO] it=  145 steps=  593920 avg10=  81.43 loss=8.440 pg=-0.009 vf=16.926 H=4.752 KL=0.4104 clip_frac=0.164
[PPO] it=  148 steps=  606208 avg10=  88.67 loss=0.523 pg=-0.000 vf=1.076 H=4.744 KL=0.0118 clip_frac=0.139
[PPO] it=  151 steps=  618496 avg10=  99.30 loss=1.419 pg=-0.002 vf=2.872 H=4.720 KL=0.0140 clip_frac=0.152
[PPO] it=  154 steps=  630784 avg10=  97.58 loss=1.047 pg=-0.002 vf=2.127 H=4.693 KL=0.0130 clip_frac=0.164
[PPO] it=  157 steps=  643072 avg10= 110.96 loss=0.627 pg=0.001 vf=1.280 H=4.670 KL=0.0135 clip_frac=0.156
[PPO] it=  160 steps=  655360 avg10= 111.16 loss=0.651 pg=0.003 vf=1.325 H=4.655 KL=0.0166 clip_frac=0.198
[PPO] it=  163 steps=  667648 avg10=  88.62 loss=10.674 pg=0.014 vf=21.349 H=4.660 KL=0.1578 clip_frac=0.155
[PPO] it=  166 steps=  679936 avg10=  99.50 loss=0.705 pg=0.058 vf=1.322 H=4.637 KL=0.0991 clip_frac=0.188
[PPO] it=  169 steps=  692224 avg10= 121.19 loss=0.518 pg=-0.000 vf=1.065 H=4.613 KL=0.0142 clip_frac=0.177
[PPO] it=  172 steps=  704512 avg10= 114.87 loss=0.581 pg=-0.000 vf=1.189 H=4.605 KL=0.0134 clip_frac=0.174
[PPO] it=  175 steps=  716800 avg10= 125.77 loss=0.945 pg=0.000 vf=1.918 H=4.541 KL=0.0121 clip_frac=0.168
[PPO] it=  178 steps=  729088 avg10= 129.76 loss=0.969 pg=-0.000 vf=1.966 H=4.554 KL=0.0120 clip_frac=0.160
[PPO] it=  181 steps=  741376 avg10= 102.58 loss=0.819 pg=-0.000 vf=1.667 H=4.558 KL=0.0147 clip_frac=0.177
[PPO] it=  184 steps=  753664 avg10= 135.20 loss=1.517 pg=0.004 vf=3.053 H=4.554 KL=0.0156 clip_frac=0.231
[PPO] it=  187 steps=  765952 avg10= 127.82 loss=0.755 pg=-0.002 vf=1.542 H=4.549 KL=0.0111 clip_frac=0.138
[PPO] it=  190 steps=  778240 avg10= 136.13 loss=0.689 pg=-0.005 vf=1.414 H=4.511 KL=0.0141 clip_frac=0.190
[PPO] it=  193 steps=  790528 avg10= 132.88 loss=0.740 pg=0.000 vf=1.506 H=4.483 KL=0.0136 clip_frac=0.169
[PPO] it=  196 steps=  800000 avg10= 140.97 loss=0.316 pg=-0.007 vf=0.672 H=4.479 KL=0.0090 clip_frac=0.116
[PPO] done steps=800000 time=1298.8s avg10=140.97
Saved BipedalWalker PPO run 18 model to a3_bonus_ppo_artifacts/bipedal_walker/run_18_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_18_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return -89.97 steps 583
Eval episode 2 seed 1228 return 227.46 steps 1600
Eval episode 3 seed 1229 return 222.67 steps 1600
Eval episode 4 seed 1230 return -21.82 steps 687
Eval episode 5 seed 1231 return 185.04 steps 1600
Eval episode 6 seed 1232 return -139.41 steps 1544
Eval episode 7 seed 1233 return 210.35 steps 1600
Eval episode 8 seed 1234 return 204.03 steps 1600
Eval episode 9 seed 1235 return -55.89 steps 556
Eval episode 10 seed 1236 return -119.56 steps 362
Greedy evaluation mean 62.29  std 151.02
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_18_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_18_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=2, seed=1228, return=227.46, steps=1600
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_18_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 227.46 steps 1600 with seed 1228 into a3_bonus_ppo_artifacts/bipedal_walker/run_18_bipedal_ppo/videos
Replayed best episode for video: return=227.46, steps=1600

Run#19

In [ ]:
# Discover BipedalWalker dimensions
tmp_env = make_env(BIPEDAL_ENV_ID, worker_id=0, base_seed=SEED)
obs_dim_bipedal = tmp_env.observation_space.shape[0]
act_dim_bipedal = tmp_env.action_space.shape[0]
tmp_env.close()

print(f"BipedalWalker obs_dim={obs_dim_bipedal}, act_dim={act_dim_bipedal}")

# PPO model config
bipedal_cfg_run19 = PPOContinuousModelConfig(
    obs_dim=obs_dim_bipedal,
    act_dim=act_dim_bipedal,
    hidden_sizes=(256, 256),
)

bipedal_model_run19 = build_ppo_continuous_model_from_config(
    bipedal_cfg_run19
).to(device)

# PPO hyperparameters (slightly smaller clip, no entropy)
bipedal_ppo_cfg_run19 = PPOUpdateConfig(
    clip_range=0.15,     # between the conservative 0.10 and aggressive 0.20
    value_coef=0.5,
    entropy_coef=0.0,    # pure exploitation at this stage
    max_grad_norm=0.5,
    n_epochs=10,         # keep strong optimization per batch
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run19 = "run_19_bipedal_ppo"
bipedal_run_dir_run19 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run19)
print(f"BipedalWalker PPO run 19 dir: {bipedal_run_dir_run19}")

# Training budget (same style as run 18 but a bit longer)
bipedal_total_steps_run19 = 1_000_000
bipedal_rollout_len_run19 = 4096

# Train PPO on BipedalWalker
bipedal_model_run19, bipedal_episode_returns_run19, bipedal_logs_run19 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run19,
    control_type="continuous",
    run_dir=bipedal_run_dir_run19,
    total_env_steps=bipedal_total_steps_run19,
    rollout_len=bipedal_rollout_len_run19,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run19,
    lr=2e-4,              # a bit smaller than run 18 (3e-4)
    log_every=10_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run19, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run19, dtype=np.float32),
)

bipedal_model_path_run19 = os.path.join(bipedal_run_dir_run19, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run19.state_dict(), bipedal_model_path_run19)
print(f"Saved BipedalWalker PPO run 19 model to {bipedal_model_path_run19}")

# Training curve (learning curve)
plot_rewards(
    rewards=bipedal_episode_returns_run19,
    run_dir=bipedal_run_dir_run19,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 19)",
    ma_window=20,
)

# Greedy evaluation with CSV logging (10 episodes)
csv_path_bipedal_run19 = os.path.join(bipedal_run_dir_run19, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run19 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run19,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run19,
)

# save eval .npy
np.save(
    os.path.join(bipedal_run_dir_run19, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run19, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run19,
    run_dir=bipedal_run_dir_run19,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 19)",
    ma_window=3,
)

# Record video of the best greedy evaluation episode
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run19,
    control_type="continuous",
    run_dir=bipedal_run_dir_run19,
    csv_path=csv_path_bipedal_run19,
    max_steps=1600,
)
BipedalWalker obs_dim=24, act_dim=4
BipedalWalker PPO run 19 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_19_bipedal_ppo
[PPO] it=    1 steps=    4096 avg10=-116.11 loss=50.330 pg=0.003 vf=100.654 H=5.678 KL=0.0119 clip_frac=0.143
[PPO] it=    4 steps=   16384 avg10=-116.78 loss=39.271 pg=0.002 vf=78.539 H=5.671 KL=0.0144 clip_frac=0.197
[PPO] it=    7 steps=   28672 avg10=-127.49 loss=68.351 pg=0.000 vf=136.703 H=5.676 KL=0.0117 clip_frac=0.167
[PPO] it=   10 steps=   40960 avg10=-120.05 loss=17.248 pg=0.003 vf=34.490 H=5.673 KL=0.0125 clip_frac=0.168
[PPO] it=   13 steps=   53248 avg10=-113.49 loss=7.611 pg=0.000 vf=15.221 H=5.643 KL=0.0151 clip_frac=0.201
[PPO] it=   16 steps=   65536 avg10=-113.02 loss=0.175 pg=0.002 vf=0.347 H=5.555 KL=0.0122 clip_frac=0.162
[PPO] it=   19 steps=   77824 avg10=-105.35 loss=0.198 pg=0.001 vf=0.395 H=5.497 KL=0.0113 clip_frac=0.143
[PPO] it=   22 steps=   90112 avg10=-102.12 loss=0.184 pg=0.001 vf=0.366 H=5.439 KL=0.0108 clip_frac=0.145
[PPO] it=   25 steps=  102400 avg10= -98.21 loss=13.817 pg=-0.007 vf=27.647 H=5.402 KL=0.0493 clip_frac=0.070
[PPO] it=   28 steps=  114688 avg10= -98.42 loss=0.307 pg=0.004 vf=0.606 H=5.417 KL=0.0144 clip_frac=0.202
[PPO] it=   31 steps=  126976 avg10= -85.35 loss=0.060 pg=-0.000 vf=0.121 H=5.338 KL=0.0102 clip_frac=0.124
[PPO] it=   34 steps=  139264 avg10= -79.62 loss=0.102 pg=-0.001 vf=0.207 H=5.278 KL=0.0094 clip_frac=0.103
[PPO] it=   37 steps=  151552 avg10= -77.31 loss=0.109 pg=0.000 vf=0.218 H=5.255 KL=0.0092 clip_frac=0.095
[PPO] it=   40 steps=  163840 avg10= -77.38 loss=0.105 pg=-0.002 vf=0.215 H=5.221 KL=0.0095 clip_frac=0.095
[PPO] it=   43 steps=  176128 avg10= -79.48 loss=3.859 pg=-0.002 vf=7.721 H=5.184 KL=0.2701 clip_frac=0.129
[PPO] it=   46 steps=  188416 avg10= -77.45 loss=0.137 pg=-0.003 vf=0.278 H=5.128 KL=0.0093 clip_frac=0.080
[PPO] it=   49 steps=  200704 avg10= -67.96 loss=0.114 pg=-0.004 vf=0.236 H=5.064 KL=0.0101 clip_frac=0.115
[PPO] it=   52 steps=  212992 avg10= -66.13 loss=0.219 pg=-0.003 vf=0.443 H=5.040 KL=0.0097 clip_frac=0.099
[PPO] it=   55 steps=  225280 avg10= -59.12 loss=0.154 pg=-0.005 vf=0.318 H=4.978 KL=0.0103 clip_frac=0.104
[PPO] it=   58 steps=  237568 avg10= -53.07 loss=0.124 pg=-0.006 vf=0.259 H=4.916 KL=0.0100 clip_frac=0.110
[PPO] it=   61 steps=  249856 avg10= -46.20 loss=0.236 pg=-0.006 vf=0.483 H=4.913 KL=0.0121 clip_frac=0.121
[PPO] it=   64 steps=  262144 avg10= -46.64 loss=0.126 pg=-0.005 vf=0.262 H=4.871 KL=0.0110 clip_frac=0.127
[PPO] it=   67 steps=  274432 avg10= -29.46 loss=0.293 pg=-0.004 vf=0.593 H=4.789 KL=0.0106 clip_frac=0.122
[PPO] it=   70 steps=  286720 avg10= -20.58 loss=0.186 pg=-0.006 vf=0.385 H=4.723 KL=0.0109 clip_frac=0.135
[PPO] it=   73 steps=  299008 avg10= -15.28 loss=0.151 pg=-0.007 vf=0.316 H=4.712 KL=0.0101 clip_frac=0.127
[PPO] it=   76 steps=  311296 avg10=  -1.51 loss=0.092 pg=-0.007 vf=0.198 H=4.638 KL=0.0112 clip_frac=0.140
[PPO] it=   79 steps=  323584 avg10=   4.11 loss=0.142 pg=-0.008 vf=0.299 H=4.587 KL=0.0115 clip_frac=0.140
[PPO] it=   82 steps=  335872 avg10=  13.32 loss=0.166 pg=-0.008 vf=0.349 H=4.556 KL=0.0128 clip_frac=0.153
[PPO] it=   85 steps=  348160 avg10=   9.35 loss=9.711 pg=-0.011 vf=19.443 H=4.505 KL=1.0940 clip_frac=0.175
[PPO] it=   88 steps=  360448 avg10=  15.15 loss=0.265 pg=-0.008 vf=0.546 H=4.490 KL=0.0148 clip_frac=0.182
[PPO] it=   91 steps=  372736 avg10=  30.87 loss=0.701 pg=-0.010 vf=1.423 H=4.440 KL=0.4509 clip_frac=0.194
[PPO] it=   94 steps=  385024 avg10=  20.74 loss=0.759 pg=-0.005 vf=1.528 H=4.448 KL=0.0170 clip_frac=0.200
[PPO] it=   97 steps=  397312 avg10=  37.47 loss=0.552 pg=-0.009 vf=1.122 H=4.400 KL=0.0152 clip_frac=0.189
[PPO] it=  100 steps=  409600 avg10= -97.89 loss=8.670 pg=-0.002 vf=17.345 H=4.364 KL=14.8926 clip_frac=0.534
[PPO] it=  103 steps=  421888 avg10=  -1.35 loss=23.928 pg=-0.001 vf=47.858 H=4.371 KL=0.1134 clip_frac=0.284
[PPO] it=  106 steps=  434176 avg10=  32.98 loss=0.979 pg=0.010 vf=1.938 H=4.348 KL=0.0350 clip_frac=0.279
[PPO] it=  109 steps=  446464 avg10=  56.59 loss=0.479 pg=-0.004 vf=0.966 H=4.369 KL=0.0142 clip_frac=0.203
[PPO] it=  112 steps=  458752 avg10=-101.21 loss=8.907 pg=-0.000 vf=17.815 H=4.356 KL=4.3667 clip_frac=0.384
[PPO] it=  115 steps=  471040 avg10=  23.43 loss=0.726 pg=-0.006 vf=1.463 H=4.326 KL=0.0153 clip_frac=0.215
[PPO] it=  118 steps=  483328 avg10=  71.54 loss=0.923 pg=-0.004 vf=1.854 H=4.274 KL=0.0161 clip_frac=0.205
[PPO] it=  121 steps=  495616 avg10=  77.62 loss=0.640 pg=-0.006 vf=1.292 H=4.260 KL=0.0148 clip_frac=0.207
[PPO] it=  124 steps=  507904 avg10=  -1.88 loss=8.561 pg=0.007 vf=17.108 H=4.213 KL=2.9173 clip_frac=0.346
[PPO] it=  127 steps=  520192 avg10=   7.44 loss=1.144 pg=-0.004 vf=2.296 H=4.221 KL=0.0159 clip_frac=0.249
[PPO] it=  130 steps=  532480 avg10=  79.89 loss=0.944 pg=-0.004 vf=1.895 H=4.206 KL=0.0164 clip_frac=0.213
[PPO] it=  133 steps=  544768 avg10=  95.39 loss=0.785 pg=-0.005 vf=1.580 H=4.171 KL=0.0168 clip_frac=0.238
[PPO] it=  136 steps=  557056 avg10=  98.53 loss=1.068 pg=-0.004 vf=2.144 H=4.157 KL=0.0181 clip_frac=0.240
[PPO] it=  139 steps=  569344 avg10=  -3.59 loss=0.846 pg=-0.002 vf=1.697 H=4.141 KL=0.0255 clip_frac=0.278
[PPO] it=  142 steps=  581632 avg10= 122.45 loss=0.855 pg=-0.003 vf=1.716 H=4.157 KL=0.0218 clip_frac=0.278
[PPO] it=  145 steps=  593920 avg10= 118.90 loss=0.865 pg=-0.005 vf=1.739 H=4.133 KL=0.0189 clip_frac=0.250
[PPO] it=  148 steps=  606208 avg10= -19.21 loss=8.588 pg=-0.026 vf=17.229 H=4.070 KL=7.6605 clip_frac=0.438
[PPO] it=  151 steps=  618496 avg10=  57.13 loss=1.313 pg=-0.006 vf=2.637 H=4.038 KL=0.0191 clip_frac=0.258
[PPO] it=  154 steps=  630784 avg10= 133.96 loss=0.701 pg=-0.007 vf=1.416 H=4.003 KL=0.0200 clip_frac=0.280
[PPO] it=  157 steps=  643072 avg10= 141.29 loss=0.851 pg=-0.007 vf=1.716 H=3.976 KL=0.0168 clip_frac=0.250
[PPO] it=  160 steps=  655360 avg10= -57.97 loss=1.658 pg=0.061 vf=3.194 H=3.946 KL=1.6538 clip_frac=0.363
[PPO] it=  163 steps=  667648 avg10= 168.83 loss=1.343 pg=-0.005 vf=2.695 H=3.869 KL=0.0191 clip_frac=0.247
[PPO] it=  166 steps=  679936 avg10= 173.91 loss=1.514 pg=-0.003 vf=3.035 H=3.794 KL=0.0193 clip_frac=0.265
[PPO] it=  169 steps=  692224 avg10= 177.61 loss=1.584 pg=0.002 vf=3.164 H=3.753 KL=0.0238 clip_frac=0.285
[PPO] it=  172 steps=  704512 avg10=  61.15 loss=1.623 pg=-0.003 vf=3.252 H=3.753 KL=0.0208 clip_frac=0.313
[PPO] it=  175 steps=  716800 avg10= 179.22 loss=1.256 pg=-0.005 vf=2.523 H=3.726 KL=0.0270 clip_frac=0.281
[PPO] it=  178 steps=  729088 avg10= 180.58 loss=1.458 pg=-0.003 vf=2.921 H=3.693 KL=0.0211 clip_frac=0.281
[PPO] it=  181 steps=  741376 avg10= -26.12 loss=1.376 pg=0.022 vf=2.708 H=3.647 KL=0.3085 clip_frac=0.288
[PPO] it=  184 steps=  753664 avg10= 184.74 loss=0.970 pg=-0.004 vf=1.947 H=3.666 KL=0.0204 clip_frac=0.310
[PPO] it=  187 steps=  765952 avg10= 189.84 loss=1.526 pg=-0.002 vf=3.058 H=3.675 KL=0.0186 clip_frac=0.279
[PPO] it=  190 steps=  778240 avg10= -81.96 loss=12.706 pg=-0.049 vf=25.511 H=3.657 KL=35.6240 clip_frac=0.596
[PPO] it=  193 steps=  790528 avg10= 137.77 loss=1.052 pg=-0.005 vf=2.115 H=3.625 KL=0.0212 clip_frac=0.289
[PPO] it=  196 steps=  802816 avg10= 194.48 loss=1.228 pg=-0.005 vf=2.465 H=3.589 KL=0.0199 clip_frac=0.303
[PPO] it=  199 steps=  815104 avg10=-115.14 loss=10.066 pg=-0.015 vf=20.163 H=3.571 KL=11.9287 clip_frac=0.610
[PPO] it=  202 steps=  827392 avg10= 121.08 loss=1.697 pg=0.001 vf=3.392 H=3.526 KL=0.0281 clip_frac=0.328
[PPO] it=  205 steps=  839680 avg10= 165.56 loss=1.579 pg=-0.000 vf=3.159 H=3.538 KL=0.0347 clip_frac=0.323
[PPO] it=  208 steps=  851968 avg10= -80.01 loss=8.743 pg=-0.049 vf=17.586 H=3.475 KL=39.6341 clip_frac=0.626
[PPO] it=  211 steps=  864256 avg10= 196.32 loss=3.034 pg=-0.001 vf=6.070 H=3.466 KL=0.0266 clip_frac=0.319
[PPO] it=  214 steps=  876544 avg10= 232.81 loss=2.039 pg=-0.002 vf=4.081 H=3.470 KL=0.0245 clip_frac=0.319
[PPO] it=  217 steps=  888832 avg10= -47.10 loss=2.847 pg=0.015 vf=5.663 H=3.443 KL=2.2104 clip_frac=0.396
[PPO] it=  220 steps=  901120 avg10= 230.47 loss=2.484 pg=-0.001 vf=4.971 H=3.425 KL=0.0282 clip_frac=0.343
[PPO] it=  223 steps=  913408 avg10= 236.67 loss=1.723 pg=0.000 vf=3.445 H=3.458 KL=0.0214 clip_frac=0.329
[PPO] it=  226 steps=  925696 avg10=  24.78 loss=3.139 pg=0.015 vf=6.249 H=3.444 KL=0.3799 clip_frac=0.307
[PPO] it=  229 steps=  937984 avg10= 240.58 loss=2.504 pg=0.001 vf=5.005 H=3.437 KL=0.0247 clip_frac=0.358
[PPO] it=  232 steps=  950272 avg10=-115.07 loss=2.385 pg=-0.054 vf=4.876 H=3.435 KL=41.4744 clip_frac=0.584
[PPO] it=  235 steps=  962560 avg10= 134.78 loss=3.099 pg=0.002 vf=6.194 H=3.445 KL=0.0260 clip_frac=0.342
[PPO] it=  238 steps=  974848 avg10= 236.28 loss=3.522 pg=0.002 vf=7.039 H=3.438 KL=0.1716 clip_frac=0.340
[PPO] it=  241 steps=  987136 avg10=  25.32 loss=3.116 pg=0.027 vf=6.178 H=3.464 KL=0.4610 clip_frac=0.410
[PPO] it=  244 steps=  999424 avg10= 245.31 loss=3.755 pg=-0.004 vf=7.517 H=3.453 KL=0.0263 clip_frac=0.337
[PPO] it=  245 steps= 1000000 avg10= 245.31 loss=0.323 pg=-0.017 vf=0.680 H=3.456 KL=0.0215 clip_frac=0.259
[PPO] done steps=1000000 time=2334.5s avg10=245.31
Saved BipedalWalker PPO run 19 model to a3_bonus_ppo_artifacts/bipedal_walker/run_19_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_19_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return 253.84 steps 1338
Eval episode 2 seed 1228 return 243.42 steps 1523
Eval episode 3 seed 1229 return 256.67 steps 1309
Eval episode 4 seed 1230 return 261.42 steps 1277
Eval episode 5 seed 1231 return 262.61 steps 1240
Eval episode 6 seed 1232 return 257.09 steps 1336
Eval episode 7 seed 1233 return 258.91 steps 1314
Eval episode 8 seed 1234 return 257.32 steps 1306
Eval episode 9 seed 1235 return 261.13 steps 1285
Eval episode 10 seed 1236 return 260.01 steps 1272
Greedy evaluation mean 257.24  std 5.24
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_19_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_19_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=5, seed=1231, return=262.61, steps=1240
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_19_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return 262.61 steps 1240 with seed 1231 into a3_bonus_ppo_artifacts/bipedal_walker/run_19_bipedal_ppo/videos
Replayed best episode for video: return=262.61, steps=1240

Run#20

In [ ]:
# Discover BipedalWalker dimensions
tmp_env = make_env(BIPEDAL_ENV_ID, worker_id=0, base_seed=SEED)
obs_dim_bipedal = tmp_env.observation_space.shape[0]
act_dim_bipedal = tmp_env.action_space.shape[0]
tmp_env.close()

print(f"BipedalWalker obs_dim={obs_dim_bipedal}, act_dim={act_dim_bipedal}")

# PPO model config (same network as before)
bipedal_cfg_run20 = PPOContinuousModelConfig(
    obs_dim=obs_dim_bipedal,
    act_dim=act_dim_bipedal,
    hidden_sizes=(256, 256),
)

bipedal_model_run20 = build_ppo_continuous_model_from_config(
    bipedal_cfg_run20
).to(device)


bipedal_ppo_cfg_run20 = PPOUpdateConfig(
    clip_range=0.08,
    value_coef=0.5,
    entropy_coef=0.0005,
    max_grad_norm=0.5,
    n_epochs=4,
    batch_size=64,
    normalize_adv=True,
)

# Run name / directory
bipedal_run_name_run20 = "run_20_bipedal_ppo"
bipedal_run_dir_run20 = make_run_dir(BIPEDAL_ROOT, bipedal_run_name_run20)
print(f"BipedalWalker PPO run 20 dir: {bipedal_run_dir_run20}")

# Training budget (same as run 19)
bipedal_total_steps_run20 = 1_000_000
bipedal_rollout_len_run20 = 4096

# Train PPO on BipedalWalker
bipedal_model_run20, bipedal_episode_returns_run20, bipedal_logs_run20 = train_ppo_single_env(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run20,
    control_type="continuous",
    run_dir=bipedal_run_dir_run20,
    total_env_steps=bipedal_total_steps_run20,
    rollout_len=bipedal_rollout_len_run20,
    gamma=0.99,
    ppo_cfg=bipedal_ppo_cfg_run20,
    lr=2.5e-4,          # slightly smaller than run 19 for stability
    log_every=10_000,
)

# Save training returns and model
np.save(
    os.path.join(bipedal_run_dir_run20, "ppo_bipedal_episode_returns.npy"),
    np.array(bipedal_episode_returns_run20, dtype=np.float32),
)

bipedal_model_path_run20 = os.path.join(bipedal_run_dir_run20, "ppo_bipedal_model.pth")
torch.save(bipedal_model_run20.state_dict(), bipedal_model_path_run20)
print(f"Saved BipedalWalker PPO run 20 model to {bipedal_model_path_run20}")

# Training curve
plot_rewards(
    rewards=bipedal_episode_returns_run20,
    run_dir=bipedal_run_dir_run20,
    filename="ppo_bipedal_train_rewards.png",
    title="BipedalWalker PPO training episode returns (run 20)",
    ma_window=20,
)

# Greedy evaluation with CSV logging (10 episodes)
csv_path_bipedal_run20 = os.path.join(bipedal_run_dir_run20, "ppo_bipedal_eval_log.csv")

bipedal_eval_returns_run20 = evaluate_greedy(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run20,
    control_type="continuous",
    n_episodes=10,
    max_steps=1600,
    base_seed=SEED,
    csv_path=csv_path_bipedal_run20,
)

# save eval .npy
np.save(
    os.path.join(bipedal_run_dir_run20, "ppo_bipedal_eval_returns.npy"),
    np.array(bipedal_eval_returns_run20, dtype=np.float32),
)

# Eval plot
plot_rewards(
    rewards=bipedal_eval_returns_run20,
    run_dir=bipedal_run_dir_run20,
    filename="ppo_bipedal_eval_rewards.png",
    title="BipedalWalker PPO greedy evaluation returns (run 20)",
    ma_window=3,
)

# Record video of the best greedy evaluation episode
record_best_greedy_from_csv(
    env_id=BIPEDAL_ENV_ID,
    model=bipedal_model_run20,
    control_type="continuous",
    run_dir=bipedal_run_dir_run20,
    csv_path=csv_path_bipedal_run20,
    max_steps=1600,
)
BipedalWalker obs_dim=24, act_dim=4
BipedalWalker PPO run 20 dir: a3_bonus_ppo_artifacts/bipedal_walker/run_20_bipedal_ppo
[PPO] it=    1 steps=    4096 avg10=-115.47 loss=173.586 pg=0.007 vf=347.164 H=5.683 KL=0.0092 clip_frac=0.243
[PPO] it=    4 steps=   16384 avg10=-111.67 loss=59.858 pg=0.009 vf=119.704 H=5.683 KL=0.0069 clip_frac=0.135
[PPO] it=    7 steps=   28672 avg10=-112.48 loss=90.320 pg=0.016 vf=180.613 H=5.675 KL=0.0122 clip_frac=0.243
[PPO] it=   10 steps=   40960 avg10=-119.41 loss=27.074 pg=0.007 vf=54.138 H=5.680 KL=0.0100 clip_frac=0.366
[PPO] it=   13 steps=   53248 avg10=-111.22 loss=38.384 pg=0.011 vf=76.751 H=5.659 KL=0.0138 clip_frac=0.427
[PPO] it=   16 steps=   65536 avg10=-125.89 loss=33.276 pg=0.013 vf=66.531 H=5.631 KL=0.0148 clip_frac=0.429
[PPO] it=   19 steps=   77824 avg10=-122.26 loss=51.593 pg=0.008 vf=103.174 H=5.630 KL=0.0115 clip_frac=0.303
[PPO] it=   22 steps=   90112 avg10=-108.21 loss=29.363 pg=0.007 vf=58.719 H=5.642 KL=0.0133 clip_frac=0.404
[PPO] it=   25 steps=  102400 avg10=-105.19 loss=29.709 pg=0.013 vf=59.397 H=5.658 KL=0.0196 clip_frac=0.499
[PPO] it=   28 steps=  114688 avg10=-112.87 loss=29.133 pg=0.009 vf=58.252 H=5.655 KL=0.0124 clip_frac=0.396
[PPO] it=   31 steps=  126976 avg10=-151.80 loss=65.733 pg=0.014 vf=131.445 H=5.643 KL=0.0160 clip_frac=0.492
[PPO] it=   34 steps=  139264 avg10=-110.97 loss=13.168 pg=0.024 vf=26.294 H=5.672 KL=0.0322 clip_frac=0.623
[PPO] it=   37 steps=  151552 avg10=-123.54 loss=26.969 pg=0.008 vf=53.928 H=5.690 KL=0.0119 clip_frac=0.360
[PPO] it=   40 steps=  163840 avg10=-120.13 loss=16.520 pg=0.028 vf=32.990 H=5.709 KL=0.0281 clip_frac=0.628
[PPO] it=   43 steps=  176128 avg10=-131.05 loss=0.507 pg=0.028 vf=0.963 H=5.720 KL=0.0522 clip_frac=0.726
[PPO] it=   46 steps=  188416 avg10=-132.82 loss=0.245 pg=0.017 vf=0.462 H=5.710 KL=0.0186 clip_frac=0.554
[PPO] it=   49 steps=  200704 avg10=-123.38 loss=12.088 pg=0.011 vf=24.160 H=5.710 KL=0.0149 clip_frac=0.467
[PPO] it=   52 steps=  212992 avg10=-121.12 loss=0.159 pg=0.018 vf=0.287 H=5.725 KL=0.0375 clip_frac=0.655
[PPO] it=   55 steps=  225280 avg10=-125.79 loss=0.352 pg=0.012 vf=0.686 H=5.715 KL=0.0248 clip_frac=0.606
[PPO] it=   58 steps=  237568 avg10=-126.31 loss=11.939 pg=0.006 vf=23.872 H=5.682 KL=0.0126 clip_frac=0.436
[PPO] it=   61 steps=  249856 avg10=-128.57 loss=0.103 pg=0.015 vf=0.182 H=5.671 KL=0.0300 clip_frac=0.656
[PPO] it=   64 steps=  262144 avg10=-129.27 loss=8.766 pg=0.016 vf=17.506 H=5.668 KL=0.0197 clip_frac=0.537
[PPO] it=   67 steps=  274432 avg10=-128.45 loss=0.082 pg=0.018 vf=0.134 H=5.677 KL=0.0219 clip_frac=0.569
[PPO] it=   70 steps=  286720 avg10=-128.38 loss=0.160 pg=0.012 vf=0.301 H=5.670 KL=0.0183 clip_frac=0.539
[PPO] it=   73 steps=  299008 avg10=-125.72 loss=8.189 pg=0.012 vf=16.359 H=5.698 KL=0.0134 clip_frac=0.446
[PPO] it=   76 steps=  311296 avg10=-122.86 loss=0.279 pg=0.012 vf=0.539 H=5.707 KL=0.0224 clip_frac=0.572
[PPO] it=   79 steps=  323584 avg10=-119.45 loss=0.102 pg=0.014 vf=0.181 H=5.705 KL=0.0183 clip_frac=0.540
[PPO] it=   82 steps=  335872 avg10=-119.53 loss=16.985 pg=0.006 vf=33.962 H=5.701 KL=0.0120 clip_frac=0.418
[PPO] it=   85 steps=  348160 avg10=-117.38 loss=11.755 pg=0.010 vf=23.494 H=5.699 KL=0.0177 clip_frac=0.396
[PPO] it=   88 steps=  360448 avg10=-128.23 loss=0.049 pg=0.012 vf=0.079 H=5.723 KL=0.0181 clip_frac=0.538
[PPO] it=   91 steps=  372736 avg10=-133.34 loss=0.154 pg=0.010 vf=0.295 H=5.747 KL=0.0150 clip_frac=0.484
[PPO] it=   94 steps=  385024 avg10=-130.71 loss=5.126 pg=0.010 vf=10.238 H=5.726 KL=0.0136 clip_frac=0.433
[PPO] it=   97 steps=  397312 avg10=-130.27 loss=0.853 pg=0.014 vf=1.683 H=5.732 KL=0.0262 clip_frac=0.598
[PPO] it=  100 steps=  409600 avg10=-134.67 loss=0.123 pg=0.008 vf=0.235 H=5.724 KL=0.0136 clip_frac=0.453
[PPO] it=  103 steps=  421888 avg10=-127.41 loss=16.024 pg=0.004 vf=32.045 H=5.699 KL=0.0090 clip_frac=0.271
[PPO] it=  106 steps=  434176 avg10=-129.66 loss=0.043 pg=0.013 vf=0.067 H=5.702 KL=0.0148 clip_frac=0.480
[PPO] it=  109 steps=  446464 avg10=-133.07 loss=0.043 pg=0.007 vf=0.079 H=5.670 KL=0.0102 clip_frac=0.390
[PPO] it=  112 steps=  458752 avg10=-131.08 loss=12.478 pg=0.006 vf=24.948 H=5.649 KL=0.0121 clip_frac=0.357
[PPO] it=  115 steps=  471040 avg10=-130.71 loss=0.030 pg=0.011 vf=0.043 H=5.639 KL=0.0132 clip_frac=0.428
[PPO] it=  118 steps=  483328 avg10=-127.25 loss=6.008 pg=0.004 vf=12.014 H=5.616 KL=0.0104 clip_frac=0.340
[PPO] it=  121 steps=  495616 avg10=-124.26 loss=0.035 pg=0.009 vf=0.058 H=5.576 KL=0.0104 clip_frac=0.384
[PPO] it=  124 steps=  507904 avg10=-124.40 loss=0.043 pg=0.009 vf=0.075 H=5.579 KL=0.0117 clip_frac=0.412
[PPO] it=  127 steps=  520192 avg10=-123.79 loss=6.248 pg=0.010 vf=12.481 H=5.577 KL=0.0120 clip_frac=0.349
[PPO] it=  130 steps=  532480 avg10=-124.52 loss=0.151 pg=0.006 vf=0.296 H=5.535 KL=0.0138 clip_frac=0.481
[PPO] it=  133 steps=  544768 avg10=-124.15 loss=0.176 pg=0.009 vf=0.341 H=5.503 KL=0.0140 clip_frac=0.464
[PPO] it=  136 steps=  557056 avg10=-124.92 loss=0.076 pg=0.009 vf=0.140 H=5.458 KL=0.0117 clip_frac=0.411
[PPO] it=  139 steps=  569344 avg10=-122.19 loss=0.032 pg=0.007 vf=0.054 H=5.438 KL=0.0114 clip_frac=0.366
[PPO] it=  142 steps=  581632 avg10=-121.07 loss=0.094 pg=0.007 vf=0.179 H=5.403 KL=0.0151 clip_frac=0.474
[PPO] it=  145 steps=  593920 avg10=-117.51 loss=0.078 pg=0.008 vf=0.147 H=5.373 KL=0.0098 clip_frac=0.351
[PPO] it=  148 steps=  606208 avg10=-114.69 loss=0.055 pg=0.006 vf=0.103 H=5.363 KL=0.0101 clip_frac=0.354
[PPO] it=  151 steps=  618496 avg10=-116.42 loss=0.036 pg=0.005 vf=0.067 H=5.332 KL=0.0090 clip_frac=0.308
[PPO] it=  154 steps=  630784 avg10=-114.94 loss=14.610 pg=0.005 vf=29.217 H=5.325 KL=0.0094 clip_frac=0.269
[PPO] it=  157 steps=  643072 avg10=-111.39 loss=0.052 pg=0.006 vf=0.097 H=5.318 KL=0.0110 clip_frac=0.356
[PPO] it=  160 steps=  655360 avg10=-116.18 loss=0.069 pg=0.006 vf=0.130 H=5.311 KL=0.0111 clip_frac=0.375
[PPO] it=  163 steps=  667648 avg10=-114.77 loss=0.030 pg=0.004 vf=0.057 H=5.271 KL=0.0089 clip_frac=0.309
[PPO] it=  166 steps=  679936 avg10=-114.07 loss=15.444 pg=0.007 vf=30.879 H=5.230 KL=0.0094 clip_frac=0.277
[PPO] it=  169 steps=  692224 avg10=-112.08 loss=19.602 pg=0.010 vf=39.188 H=5.198 KL=0.0089 clip_frac=0.206
[PPO] it=  172 steps=  704512 avg10=-113.46 loss=0.099 pg=0.004 vf=0.195 H=5.159 KL=0.0112 clip_frac=0.374
[PPO] it=  175 steps=  716800 avg10=-110.53 loss=0.048 pg=0.004 vf=0.091 H=5.139 KL=0.0095 clip_frac=0.333
[PPO] it=  178 steps=  729088 avg10=-114.21 loss=0.317 pg=0.005 vf=0.629 H=5.128 KL=0.0100 clip_frac=0.341
[PPO] it=  181 steps=  741376 avg10=-110.75 loss=0.287 pg=0.007 vf=0.563 H=5.095 KL=0.0122 clip_frac=0.386
[PPO] it=  184 steps=  753664 avg10=-103.04 loss=15.637 pg=0.004 vf=31.270 H=5.087 KL=0.0114 clip_frac=0.269
[PPO] it=  187 steps=  765952 avg10=-105.16 loss=0.127 pg=0.009 vf=0.240 H=5.083 KL=0.0135 clip_frac=0.439
[PPO] it=  190 steps=  778240 avg10=-107.81 loss=0.053 pg=0.007 vf=0.097 H=5.052 KL=0.0102 clip_frac=0.317
[PPO] it=  193 steps=  790528 avg10=-103.98 loss=0.042 pg=0.003 vf=0.083 H=5.009 KL=0.0077 clip_frac=0.246
[PPO] it=  196 steps=  802816 avg10= -99.70 loss=0.328 pg=0.002 vf=0.656 H=4.982 KL=0.0079 clip_frac=0.260
[PPO] it=  199 steps=  815104 avg10= -94.80 loss=0.062 pg=0.004 vf=0.121 H=4.965 KL=0.0082 clip_frac=0.306
[PPO] it=  202 steps=  827392 avg10=-101.38 loss=4.503 pg=0.008 vf=8.996 H=4.956 KL=0.0091 clip_frac=0.241
[PPO] it=  205 steps=  839680 avg10= -98.37 loss=0.049 pg=0.002 vf=0.099 H=4.925 KL=0.0089 clip_frac=0.296
[PPO] it=  208 steps=  851968 avg10= -93.82 loss=0.044 pg=0.004 vf=0.085 H=4.916 KL=0.0089 clip_frac=0.272
[PPO] it=  211 steps=  864256 avg10= -89.38 loss=0.034 pg=0.003 vf=0.066 H=4.898 KL=0.0083 clip_frac=0.260
[PPO] it=  214 steps=  876544 avg10= -92.11 loss=15.641 pg=0.014 vf=31.260 H=4.864 KL=0.0159 clip_frac=0.276
[PPO] it=  217 steps=  888832 avg10= -88.50 loss=0.047 pg=0.003 vf=0.092 H=4.838 KL=0.0078 clip_frac=0.242
[PPO] it=  220 steps=  901120 avg10= -87.43 loss=0.041 pg=0.003 vf=0.080 H=4.797 KL=0.0078 clip_frac=0.250
[PPO] it=  223 steps=  913408 avg10= -86.10 loss=0.211 pg=0.004 vf=0.418 H=4.797 KL=0.0080 clip_frac=0.253
[PPO] it=  226 steps=  925696 avg10= -86.78 loss=0.047 pg=0.003 vf=0.093 H=4.788 KL=0.0074 clip_frac=0.222
[PPO] it=  229 steps=  937984 avg10= -83.84 loss=0.048 pg=0.003 vf=0.095 H=4.759 KL=0.0067 clip_frac=0.235
[PPO] it=  232 steps=  950272 avg10= -83.05 loss=0.082 pg=0.002 vf=0.165 H=4.765 KL=0.0069 clip_frac=0.184
[PPO] it=  235 steps=  962560 avg10= -85.21 loss=0.074 pg=0.000 vf=0.152 H=4.740 KL=0.0063 clip_frac=0.173
[PPO] it=  238 steps=  974848 avg10= -82.84 loss=0.063 pg=0.000 vf=0.130 H=4.723 KL=0.0058 clip_frac=0.168
[PPO] it=  241 steps=  987136 avg10= -85.84 loss=10.075 pg=0.021 vf=20.113 H=4.711 KL=0.1714 clip_frac=0.539
[PPO] it=  244 steps=  999424 avg10= -88.89 loss=16.342 pg=0.021 vf=32.646 H=4.702 KL=0.0214 clip_frac=0.319
[PPO] it=  245 steps= 1000000 avg10= -83.67 loss=0.849 pg=0.021 vf=1.659 H=4.704 KL=0.0117 clip_frac=0.256
[PPO] done steps=1000000 time=1597.6s avg10=-83.67
Saved BipedalWalker PPO run 20 model to a3_bonus_ppo_artifacts/bipedal_walker/run_20_bipedal_ppo/ppo_bipedal_model.pth
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_20_bipedal_ppo/ppo_bipedal_train_rewards.png
Eval episode 1 seed 1227 return -90.96 steps 1600
Eval episode 2 seed 1228 return -88.59 steps 1600
Eval episode 3 seed 1229 return -88.96 steps 1600
Eval episode 4 seed 1230 return -89.87 steps 1600
Eval episode 5 seed 1231 return -88.97 steps 1600
Eval episode 6 seed 1232 return -88.59 steps 1600
Eval episode 7 seed 1233 return -88.99 steps 1600
Eval episode 8 seed 1234 return -88.59 steps 1600
Eval episode 9 seed 1235 return -89.07 steps 1600
Eval episode 10 seed 1236 return -90.12 steps 1600
Greedy evaluation mean -89.27  std 0.75
Saved greedy eval log to a3_bonus_ppo_artifacts/bipedal_walker/run_20_bipedal_ppo/ppo_bipedal_eval_log.csv
No description has been provided for this image
Saved plot to a3_bonus_ppo_artifacts/bipedal_walker/run_20_bipedal_ppo/ppo_bipedal_eval_rewards.png
Best eval episode from CSV: ep=2, seed=1228, return=-88.59, steps=1600
/usr/local/lib/python3.12/dist-packages/gymnasium/wrappers/rendering.py:293: UserWarning: WARN: Overwriting existing videos at /content/a3_bonus_ppo_artifacts/bipedal_walker/run_20_bipedal_ppo/videos folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)
  logger.warn(
Recorded greedy PPO episode return -88.59 steps 1600 with seed 1228 into a3_bonus_ppo_artifacts/bipedal_walker/run_20_bipedal_ppo/videos
Replayed best episode for video: return=-88.59, steps=1600